# Function to check and install packages if not already installed
install_if_missing <- function(packages) {
for (pkg in packages) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg, dependencies = TRUE)
}
library(pkg, character.only = TRUE)
}
}
# List of necessary packages
packages <- c("RSQLite", "ggplot2", "tidyr", "dplyr", "plotly", "sf", "viridis", "rmapshaper", "rlang", "scales", "gridExtra")
# Install and load the necessary packages
install_if_missing(packages)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
## Loading required package: viridisLite
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Only run the next codes if being run for the first time
# Adjusted path for the database
# con <- dbConnect(RSQLite::SQLite(), "/Users/mellogwayo/Desktop/Household Financial Health Analytics Platform/database/variables for analysis.db")
# 'data' is the table name within the SQLite database
# df <- dbReadTable(con, "data")
# Close the connection
# dbDisconnect(con)
# Original variables CSV file path updated
# write.csv(df, "/Users/mellogwayo/Desktop/Household Financial Health Analytics Platform/data/processed/variables.csv", row.names = FALSE)
# Loading the data from the CSV file into a DataFrame
data <- read.csv("/Users/mellogwayo/Desktop/Household Financial Health Analytics Platform/data/processed/transformed_variables.csv")
# View the first few rows of the data to confirm it's loaded correctly
head(data)
## pidp jkl_hidp j_hidp k_hidp l_hidp jkl_pno jkl_mnpno jkl_fnpno
## 1 68008847 68013622 -14 -14 68013622 1 not in hh not in hh
## 2 68010887 68027222 -14 -14 68027222 1 not in hh not in hh
## 3 68068082 68027222 -14 -14 68027222 2 not in hh not in hh
## 4 68031967 68081622 -14 -14 68081622 1 not in hh not in hh
## 5 68036727 68095222 -14 -14 68095222 1 not in hh not in hh
## 6 68037407 68102022 -14 -14 68102022 1 not in hh not in hh
## jkl_mnpid jkl_fnpid jkl_mnspno jkl_fnspno jkl_mnspid jkl_fnspid
## 1 inapplicable inapplicable not in hh not in hh inapplicable inapplicable
## 2 inapplicable inapplicable not in hh not in hh inapplicable inapplicable
## 3 inapplicable inapplicable not in hh not in hh inapplicable inapplicable
## 4 inapplicable inapplicable not in hh not in hh inapplicable inapplicable
## 5 inapplicable inapplicable not in hh not in hh inapplicable inapplicable
## 6 inapplicable inapplicable not in hh not in hh inapplicable inapplicable
## jkl_grmpno jkl_grfpno jkl_childpno jkl_country jkl_gor_dv jkl_urban_dv
## 1 not in hh not in hh inapplicable England North East urban area
## 2 not in hh not in hh inapplicable England North East urban area
## 3 not in hh not in hh inapplicable England North East urban area
## 4 not in hh not in hh inapplicable England West Midlands urban area
## 5 not in hh not in hh inapplicable England North East urban area
## 6 not in hh not in hh inapplicable England North East urban area
## jkl_sex_dv jkl_mastat_dv
## 1 Female Widowed
## 2 Female Married
## 3 Male Married
## 4 Female Widowed
## 5 Male Widowed
## 6 Female Single and never married/in civil partnership
## jkl_jbstat jkl_ethn_dv
## 1 Paid employment(ft/pt) british/english/scottish/welsh/northern irish
## 2 Paid employment(ft/pt) british/english/scottish/welsh/northern irish
## 3 Self employed british/english/scottish/welsh/northern irish
## 4 Retired british/english/scottish/welsh/northern irish
## 5 Retired british/english/scottish/welsh/northern irish
## 6 Paid employment(ft/pt) british/english/scottish/welsh/northern irish
## bornuk_dv jkl_jbsoc00_cc
## 1 born in uk Administrative occupations: general
## 2 born in uk Healthcare and related personal services
## 3 born in uk Construction trades
## 4 born in uk inapplicable
## 5 born in uk inapplicable
## 6 born in uk Assemblers and routine operatives
## jkl_jbnssec8_dv jkl_jbnssec5_dv jkl_jbnssec3_dv
## 1 Intermediate Intermediate Intermediate
## 2 Semi-routine Semi-routine & routine Routine
## 3 Small employers & own account Small employers & own account Intermediate
## 4 inapplicable inapplicable inapplicable
## 5 inapplicable inapplicable inapplicable
## 6 Routine Semi-routine & routine Routine
## jkl_fimnnet_dv jkl_health jkl_nkids_dv
## 1 2288.00 Yes none
## 2 1200.00 Yes none
## 3 3275.91 No none
## 4 1282.17 Yes none
## 5 1456.17 Yes none
## 6 1186.31 No none
## jkl_hhtype_dv
## 1 1 female, age 60+, no children
## 2 Couple both under pensionable age, no children
## 3 Couple both under pensionable age, no children
## 4 1 female, age 60+, no children
## 5 1 male, aged 65+, no children
## 6 2 adults, not a couple, both under pensionable age, no children
## jkl_tenure_dv jkl_fihhmnnet1_dv
## 1 Owned with mortgage 2288.00
## 2 Owned with mortgage 4475.91
## 3 Owned with mortgage 4475.91
## 4 Local authority rent 1282.17
## 5 missing 1456.17
## 6 Rented private unfurnished 2446.63
ggplot(data, aes(x = jkl_country)) +
geom_bar() +
labs(title = "Distribution of Sample Members by Country", x = "Country", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Most of the respondents in this study are from England.
ggplot(data, aes(x = jkl_gor_dv)) +
geom_bar() +
labs(
title = "Distribution of Members by Region",
x = "Region",
y = "Count"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The South East, London, North west and Scotland were the regions that recorded the most respondents.
The North East had a significantly lower number of respondents.
# Calculate percentages for the employment status
employment_status <- data %>%
count(jkl_jbstat) %>%
mutate(percentage = n / sum(n) * 100)
employment_status
## jkl_jbstat n percentage
## 1 Doing something else 101 0.53046218
## 2 Family care or home 399 2.09558824
## 3 Full-time student 117 0.61449580
## 4 Govt training scheme 2 0.01050420
## 5 LT sick or disabled 716 3.76050420
## 6 On apprenticeship 6 0.03151261
## 7 On furlough 146 0.76680672
## 8 On maternity leave 15 0.07878151
## 9 Paid employment(ft/pt) 7527 39.53256303
## 10 Retired 8040 42.22689076
## 11 Self employed 1377 7.23214286
## 12 Temporarily laid off/short term working 19 0.09978992
## 13 Unemployed 536 2.81512605
## 14 Unpaid, family business 13 0.06827731
## 15 don't know 8 0.04201681
## 16 refusal 18 0.09453782
ggplot(data, aes(x = "", fill = jkl_jbstat)) +
geom_bar(width = 1) +
coord_polar(theta = "y") +
labs(title = "Employment Status Distribution", x = NULL, y = NULL, fill = "Employment Status") +
theme_void() +
theme(legend.title = element_text(size = 12), legend.text = element_text(size = 10))
42% of the respondents are retired whilst 39% of them are in paid employment. Other categories like long-term sickness or disability, family care, and being a student represent smaller but notable percentages of the whole data set.
ggplot(data, aes(x = jkl_tenure_dv)) +
geom_bar(fill = "steelblue") +
labs(title = "Housing Tenure of Respondents", x = "Housing Tenure", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
A significant proportion of the population in this data set own their homes.
numeric_variables <- c("jkl_fimnnet_dv", "jkl_fihhmnnet1_dv")
summary_statistics_list <- list()
for (var in numeric_variables) {
summary_statistics <- data %>%
summarise(
Mean = mean(.data[[var]], na.rm = TRUE),
Median = median(.data[[var]], na.rm = TRUE),
StdDev = sd(.data[[var]], na.rm = TRUE),
Min = min(.data[[var]], na.rm = TRUE),
Max = max(.data[[var]], na.rm = TRUE)
)
print(paste("Summary statistics for", var))
print(summary_statistics)
summary_statistics_list[[var]] <- summary_statistics
hist_plot <- ggplot(data, aes(x = !!sym(var))) +
geom_histogram(binwidth = 100, fill = 'blue') +
labs(title = paste('Histogram of', var), x = var, y = 'Count') +
scale_x_continuous(limits = c(0, 30000)) # Adjust the limits as needed
print(hist_plot)
}
## [1] "Summary statistics for jkl_fimnnet_dv"
## Mean Median StdDev Min Max
## 1 1727.58 1484.97 1596.673 -5589.49 102890.9
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## [1] "Summary statistics for jkl_fihhmnnet1_dv"
## Mean Median StdDev Min Max
## 1 3369.379 2830.57 2633.771 -3306.16 104318.9
## Warning: Removed 9 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
The mean total net personal income (jkl_fimnnet_dv) and the mean household income (jkl_fihhmnnet1_dv) are higher than their medians. This indicates that the distribution might be right-skewed. Incomes on the higher end are pulling the mean upwards.
The large standard deviation (1596.673) relative to the mean suggests there is significant variation in the total net personal income and household income levels.
The income range is very wide, from -5589.49 to 102890.9 for the total net personal income and from -3306.16 to 104318.9 for the household income. This wide range indicates significant disparities in the dataset, with some individuals having very high incomes and others experiencing losses or debts. It also shows that the dataset has extreme values or outliers. There is a diverse set of incomes, including some cases of significant financial difficulties and some cases of substantial wealth.
correlation_matrix <- data %>%
select(all_of(numeric_variables)) %>%
cor(use = "complete.obs")
print(correlation_matrix)
## jkl_fimnnet_dv jkl_fihhmnnet1_dv
## jkl_fimnnet_dv 1.0000000 0.6162484
## jkl_fihhmnnet1_dv 0.6162484 1.0000000
There is a moderate to strong positive relationship between personal incomes and household incomes. The relationship is strong enough to suggest that higher personal incomes are associated with higher household incomes.
categorical_variables <- c("jkl_country", "jkl_gor_dv", "jkl_urban_dv", "jkl_sex_dv",
"jkl_mastat_dv", "jkl_jbstat", "jkl_ethn_dv", "bornuk_dv",
"jkl_jbsoc00_cc", "jkl_jbnssec8_dv", "jkl_jbnssec5_dv",
"jkl_jbnssec3_dv", "jkl_health", "jkl_nkids_dv", "jkl_hhtype_dv",
"jkl_tenure_dv")
data_factors <- data %>%
mutate(across(all_of(categorical_variables), factor))
for (var in categorical_variables) {
cat("\nFrequency table for", var, ":\n")
frequency_counts <- table(data_factors[[var]])
print(frequency_counts)
proportions <- prop.table(frequency_counts)
cat("\nProportions for", var, ":\n")
print(proportions)
bar_plot <- ggplot(data_factors, aes(x = !!sym(var))) +
geom_bar() +
labs(title = paste("Distribution of", var), x = var, y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(bar_plot)
}
##
## Frequency table for jkl_country :
##
## England missing Northern Ireland Scotland
## 14836 5 1146 1770
## Wales
## 1283
##
## Proportions for jkl_country :
##
## England missing Northern Ireland Scotland
## 0.779201681 0.000262605 0.060189076 0.092962185
## Wales
## 0.067384454
##
## Frequency table for jkl_gor_dv :
##
## East Midlands East of England London
## 1453 1642 1954
## missing North East North West
## 5 728 1813
## Northern Ireland Scotland South East
## 1146 1770 2425
## South West Wales West Midlands
## 1702 1283 1524
## Yorkshire and the Humber
## 1595
##
## Proportions for jkl_gor_dv :
##
## East Midlands East of England London
## 0.076313025 0.086239496 0.102626050
## missing North East North West
## 0.000262605 0.038235294 0.095220588
## Northern Ireland Scotland South East
## 0.060189076 0.092962185 0.127363445
## South West Wales West Midlands
## 0.089390756 0.067384454 0.080042017
## Yorkshire and the Humber
## 0.083771008
##
## Frequency table for jkl_urban_dv :
##
## missing rural area urban area
## 56 5198 13786
##
## Proportions for jkl_urban_dv :
##
## missing rural area urban area
## 0.002941176 0.273004202 0.724054622
##
## Frequency table for jkl_sex_dv :
##
## Female inconsistent Male missing
## 10404 1 8304 331
##
## Proportions for jkl_sex_dv :
##
## Female inconsistent Male missing
## 5.464286e-01 5.252101e-05 4.361345e-01 1.738445e-02
##
## Frequency table for jkl_mastat_dv :
##
## A former civil partner
## 10
## A surviving civil partner
## 9
## Divorced
## 1713
## don't know
## 11
## In a registered same-sex civil partnership
## 151
## Living as couple
## 1949
## Married
## 10788
## missing
## 1
## refusal
## 34
## Separated but legally married
## 269
## Separated from civil partner
## 22
## Single and never married/in civil partnership
## 2413
## Widowed
## 1670
##
## Proportions for jkl_mastat_dv :
##
## A former civil partner
## 5.252101e-04
## A surviving civil partner
## 4.726891e-04
## Divorced
## 8.996849e-02
## don't know
## 5.777311e-04
## In a registered same-sex civil partnership
## 7.930672e-03
## Living as couple
## 1.023634e-01
## Married
## 5.665966e-01
## missing
## 5.252101e-05
## refusal
## 1.785714e-03
## Separated but legally married
## 1.412815e-02
## Separated from civil partner
## 1.155462e-03
## Single and never married/in civil partnership
## 1.267332e-01
## Widowed
## 8.771008e-02
##
## Frequency table for jkl_jbstat :
##
## Doing something else don't know
## 101 8
## Family care or home Full-time student
## 399 117
## Govt training scheme LT sick or disabled
## 2 716
## On apprenticeship On furlough
## 6 146
## On maternity leave Paid employment(ft/pt)
## 15 7527
## refusal Retired
## 18 8040
## Self employed Temporarily laid off/short term working
## 1377 19
## Unemployed Unpaid, family business
## 536 13
##
## Proportions for jkl_jbstat :
##
## Doing something else don't know
## 0.0053046218 0.0004201681
## Family care or home Full-time student
## 0.0209558824 0.0061449580
## Govt training scheme LT sick or disabled
## 0.0001050420 0.0376050420
## On apprenticeship On furlough
## 0.0003151261 0.0076680672
## On maternity leave Paid employment(ft/pt)
## 0.0007878151 0.3953256303
## refusal Retired
## 0.0009453782 0.4222689076
## Self employed Temporarily laid off/short term working
## 0.0723214286 0.0009978992
## Unemployed Unpaid, family business
## 0.0281512605 0.0006827731
##
## Frequency table for jkl_ethn_dv :
##
## african
## 220
## any other asian background
## 154
## any other black background
## 29
## any other ethnic group
## 67
## any other mixed background
## 63
## any other white background
## 527
## arab
## 29
## bangladeshi
## 114
## british/english/scottish/welsh/northern irish
## 16103
## caribbean
## 313
## chinese
## 73
## gypsy or irish traveller
## 4
## indian
## 530
## irish
## 212
## missing
## 154
## pakistani
## 253
## white and asian
## 65
## white and black african
## 31
## white and black caribbean
## 99
##
## Proportions for jkl_ethn_dv :
##
## african
## 0.011554622
## any other asian background
## 0.008088235
## any other black background
## 0.001523109
## any other ethnic group
## 0.003518908
## any other mixed background
## 0.003308824
## any other white background
## 0.027678571
## arab
## 0.001523109
## bangladeshi
## 0.005987395
## british/english/scottish/welsh/northern irish
## 0.845745798
## caribbean
## 0.016439076
## chinese
## 0.003834034
## gypsy or irish traveller
## 0.000210084
## indian
## 0.027836134
## irish
## 0.011134454
## missing
## 0.008088235
## pakistani
## 0.013287815
## white and asian
## 0.003413866
## white and black african
## 0.001628151
## white and black caribbean
## 0.005199580
##
## Frequency table for bornuk_dv :
##
## born in uk missing not born in uk
## 16565 236 2239
##
## Proportions for bornuk_dv :
##
## born in uk missing not born in uk
## 0.87001050 0.01239496 0.11759454
##
## Frequency table for jkl_jbsoc00_cc :
##
## Administrative occupations: communications
## 10
## Administrative occupations: finance
## 239
## Administrative occupations: general
## 246
## Administrative occupations: government and related organisations
## 104
## Administrative occupations: records
## 161
## Agricultural trades
## 102
## Animal care services
## 25
## Architects, town planners, surveyors
## 44
## Artistic and literary occupations
## 65
## Assemblers and routine operatives
## 80
## Building trades
## 34
## Business and finance associate professionals
## 261
## Business and statistical professionals
## 110
## Childcare and related personal services
## 251
## Conservation associate professionals
## 12
## Construction operatives
## 30
## Construction trades
## 142
## Corporate managers and senior officials
## 44
## Customer service occupations
## 111
## Design associate professionals
## 56
## don't know
## 13
## Draughtspersons and building inspectors
## 19
## Electrical trades
## 84
## Elementary administration occupations
## 76
## Elementary agricultural occupations
## 18
## Elementary cleaning occupations
## 216
## Elementary construction occupations
## 37
## Elementary goods storage occupations
## 81
## Elementary personal services occupations
## 138
## Elementary process plant occupations
## 49
## Elementary sales occupations
## 29
## Elementary security occupations
## 97
## Engineering professionals
## 108
## Financial institution and office managers
## 152
## Food preparation trades
## 86
## Functional managers
## 386
## Hairdressers and related occupations
## 36
## Health and social services managers
## 70
## Health associate professionals
## 307
## Health professionals
## 141
## Healthcare and related personal services
## 449
## Housekeeping occupations
## 41
## inapplicable
## 10154
## Information and communication technology professionals
## 149
## IT service delivery occupations
## 81
## Legal associate professionals
## 27
## Legal professionals
## 61
## Leisure and travel service occupations
## 45
## Librarians and related professionals
## 16
## Managers and proprietors in hospitality and leisure services
## 69
## Managers and proprietors in other service industries
## 322
## Managers in distribution, storage and retailing
## 121
## Managers in farming, horticulture, forestry and services
## 19
## Media associate professionals
## 57
## Metal forming, welding and related trades
## 18
## Metal machining, fitting and instrument making trades
## 91
## missing
## 364
## Mobile machine drivers and operatives
## 28
## Personal services occupations nec
## 14
## Plant and machine operatives
## 56
## Printing trades
## 12
## Process operatives
## 46
## Production managers
## 127
## Protective service occupations
## 81
## Protective service officers
## 11
## Public service and other associate professionals
## 173
## Public service professionals
## 86
## Quality and customer care managers
## 53
## refusal
## 49
## Research professionals
## 54
## Sales and related associate professionals
## 136
## Sales assistants and retail cashiers
## 308
## Sales related occupations
## 40
## Science and engineering technicians
## 85
## Science professionals
## 33
## Secretarial and related occupations
## 204
## Skilled trades nec
## 26
## Social welfare associate professionals
## 123
## Sports and fitness occupations
## 31
## Teaching professionals
## 481
## Textiles and garments trades
## 17
## Therapists
## 87
## Transport associate professionals
## 25
## Transport drivers and operatives
## 264
## Vehicle trades
## 36
##
## Proportions for jkl_jbsoc00_cc :
##
## Administrative occupations: communications
## 0.0005252101
## Administrative occupations: finance
## 0.0125525210
## Administrative occupations: general
## 0.0129201681
## Administrative occupations: government and related organisations
## 0.0054621849
## Administrative occupations: records
## 0.0084558824
## Agricultural trades
## 0.0053571429
## Animal care services
## 0.0013130252
## Architects, town planners, surveyors
## 0.0023109244
## Artistic and literary occupations
## 0.0034138655
## Assemblers and routine operatives
## 0.0042016807
## Building trades
## 0.0017857143
## Business and finance associate professionals
## 0.0137079832
## Business and statistical professionals
## 0.0057773109
## Childcare and related personal services
## 0.0131827731
## Conservation associate professionals
## 0.0006302521
## Construction operatives
## 0.0015756303
## Construction trades
## 0.0074579832
## Corporate managers and senior officials
## 0.0023109244
## Customer service occupations
## 0.0058298319
## Design associate professionals
## 0.0029411765
## don't know
## 0.0006827731
## Draughtspersons and building inspectors
## 0.0009978992
## Electrical trades
## 0.0044117647
## Elementary administration occupations
## 0.0039915966
## Elementary agricultural occupations
## 0.0009453782
## Elementary cleaning occupations
## 0.0113445378
## Elementary construction occupations
## 0.0019432773
## Elementary goods storage occupations
## 0.0042542017
## Elementary personal services occupations
## 0.0072478992
## Elementary process plant occupations
## 0.0025735294
## Elementary sales occupations
## 0.0015231092
## Elementary security occupations
## 0.0050945378
## Engineering professionals
## 0.0056722689
## Financial institution and office managers
## 0.0079831933
## Food preparation trades
## 0.0045168067
## Functional managers
## 0.0202731092
## Hairdressers and related occupations
## 0.0018907563
## Health and social services managers
## 0.0036764706
## Health associate professionals
## 0.0161239496
## Health professionals
## 0.0074054622
## Healthcare and related personal services
## 0.0235819328
## Housekeeping occupations
## 0.0021533613
## inapplicable
## 0.5332983193
## Information and communication technology professionals
## 0.0078256303
## IT service delivery occupations
## 0.0042542017
## Legal associate professionals
## 0.0014180672
## Legal professionals
## 0.0032037815
## Leisure and travel service occupations
## 0.0023634454
## Librarians and related professionals
## 0.0008403361
## Managers and proprietors in hospitality and leisure services
## 0.0036239496
## Managers and proprietors in other service industries
## 0.0169117647
## Managers in distribution, storage and retailing
## 0.0063550420
## Managers in farming, horticulture, forestry and services
## 0.0009978992
## Media associate professionals
## 0.0029936975
## Metal forming, welding and related trades
## 0.0009453782
## Metal machining, fitting and instrument making trades
## 0.0047794118
## missing
## 0.0191176471
## Mobile machine drivers and operatives
## 0.0014705882
## Personal services occupations nec
## 0.0007352941
## Plant and machine operatives
## 0.0029411765
## Printing trades
## 0.0006302521
## Process operatives
## 0.0024159664
## Production managers
## 0.0066701681
## Protective service occupations
## 0.0042542017
## Protective service officers
## 0.0005777311
## Public service and other associate professionals
## 0.0090861345
## Public service professionals
## 0.0045168067
## Quality and customer care managers
## 0.0027836134
## refusal
## 0.0025735294
## Research professionals
## 0.0028361345
## Sales and related associate professionals
## 0.0071428571
## Sales assistants and retail cashiers
## 0.0161764706
## Sales related occupations
## 0.0021008403
## Science and engineering technicians
## 0.0044642857
## Science professionals
## 0.0017331933
## Secretarial and related occupations
## 0.0107142857
## Skilled trades nec
## 0.0013655462
## Social welfare associate professionals
## 0.0064600840
## Sports and fitness occupations
## 0.0016281513
## Teaching professionals
## 0.0252626050
## Textiles and garments trades
## 0.0008928571
## Therapists
## 0.0045693277
## Transport associate professionals
## 0.0013130252
## Transport drivers and operatives
## 0.0138655462
## Vehicle trades
## 0.0018907563
##
## Frequency table for jkl_jbnssec8_dv :
##
## Higher professional inapplicable
## 829 10148
## Intermediate Large employers & higher management
## 1179 442
## Lower management & professional Lower supervisory & technical
## 2652 514
## missing Routine
## 440 710
## Semi-routine Small employers & own account
## 1291 835
##
## Proportions for jkl_jbnssec8_dv :
##
## Higher professional inapplicable
## 0.04353992 0.53298319
## Intermediate Large employers & higher management
## 0.06192227 0.02321429
## Lower management & professional Lower supervisory & technical
## 0.13928571 0.02699580
## missing Routine
## 0.02310924 0.03728992
## Semi-routine Small employers & own account
## 0.06780462 0.04385504
##
## Frequency table for jkl_jbnssec5_dv :
##
## inapplicable Intermediate
## 10148 1179
## Lower supervisory & technical Management & professional
## 514 3923
## missing Semi-routine & routine
## 440 2001
## Small employers & own account
## 835
##
## Proportions for jkl_jbnssec5_dv :
##
## inapplicable Intermediate
## 0.53298319 0.06192227
## Lower supervisory & technical Management & professional
## 0.02699580 0.20603992
## missing Semi-routine & routine
## 0.02310924 0.10509454
## Small employers & own account
## 0.04385504
##
## Frequency table for jkl_jbnssec3_dv :
##
## inapplicable Intermediate Management & professional
## 10148 2014 3923
## missing Routine
## 440 2515
##
## Proportions for jkl_jbnssec3_dv :
##
## inapplicable Intermediate Management & professional
## 0.53298319 0.10577731 0.20603992
## missing Routine
## 0.02310924 0.13209034
##
## Frequency table for jkl_health :
##
## don't know No refusal Yes
## 18 10899 34 8089
##
## Proportions for jkl_health :
##
## don't know No refusal Yes
## 0.0009453782 0.5724264706 0.0017857143 0.4248424370
##
## Frequency table for jkl_nkids_dv :
##
## none
## 19040
##
## Proportions for jkl_nkids_dv :
##
## none
## 1
##
## Frequency table for jkl_hhtype_dv :
##
## 1 adult under pensionable age, no children
## 2210
## 1 adult, 1 child
## 70
## 1 adult, 2 or more children
## 9
## 1 female, age 60+, no children
## 1931
## 1 male, aged 65+, no children
## 767
## 2 adults, not a couple, 1 or more children
## 55
## 2 adults, not a couple, both under pensionable age, no children
## 467
## 2 adults, not a couple, one or more over pensionable age, no children
## 417
## 3 or more adults, 1 or more children, excl. any couples
## 24
## 3 or more adults, 1-2 children, incl. at least one couple
## 511
## 3 or more adults, no children, excl. any couples
## 283
## 3 or more adults, no children, incl. at least one couple
## 2979
## Couple 1 or more over pensionable age,no children
## 5707
## Couple both under pensionable age, no children
## 3293
## Couple with 1 child
## 260
## Couple with 2 children
## 57
##
## Proportions for jkl_hhtype_dv :
##
## 1 adult under pensionable age, no children
## 0.1160714286
## 1 adult, 1 child
## 0.0036764706
## 1 adult, 2 or more children
## 0.0004726891
## 1 female, age 60+, no children
## 0.1014180672
## 1 male, aged 65+, no children
## 0.0402836134
## 2 adults, not a couple, 1 or more children
## 0.0028886555
## 2 adults, not a couple, both under pensionable age, no children
## 0.0245273109
## 2 adults, not a couple, one or more over pensionable age, no children
## 0.0219012605
## 3 or more adults, 1 or more children, excl. any couples
## 0.0012605042
## 3 or more adults, 1-2 children, incl. at least one couple
## 0.0268382353
## 3 or more adults, no children, excl. any couples
## 0.0148634454
## 3 or more adults, no children, incl. at least one couple
## 0.1564600840
## Couple 1 or more over pensionable age,no children
## 0.2997373950
## Couple both under pensionable age, no children
## 0.1729516807
## Couple with 1 child
## 0.0136554622
## Couple with 2 children
## 0.0029936975
##
## Frequency table for jkl_tenure_dv :
##
## Housing assoc rented Local authority rent
## 1007 1265
## missing Other
## 123 95
## Owned outright Owned with mortgage
## 9777 4918
## Rented from employer Rented private furnished
## 150 299
## Rented private unfurnished
## 1406
##
## Proportions for jkl_tenure_dv :
##
## Housing assoc rented Local authority rent
## 0.052888655 0.066439076
## missing Other
## 0.006460084 0.004989496
## Owned outright Owned with mortgage
## 0.513497899 0.258298319
## Rented from employer Rented private furnished
## 0.007878151 0.015703782
## Rented private unfurnished
## 0.073844538
cat_vars_summary <- summary(data_factors)
print(cat_vars_summary)
## pidp jkl_hidp j_hidp
## Min. :5.998e+05 Min. :6.801e+07 Min. : -14
## 1st Qu.:3.675e+08 1st Qu.:3.469e+08 1st Qu.: -14
## Median :7.083e+08 Median :6.873e+08 Median : -14
## Mean :7.710e+08 Mean :7.708e+08 Mean : 40823901
## 3rd Qu.:1.157e+09 3rd Qu.:1.158e+09 3rd Qu.: -14
## Max. :1.653e+09 Max. :1.638e+09 Max. :1638126818
##
## k_hidp l_hidp jkl_pno jkl_mnpno
## Min. : -14 Min. : -14 Min. : 1.000 Length:19040
## 1st Qu.: -14 1st Qu.: -14 1st Qu.: 1.000 Class :character
## Median : -14 Median :139022622 Median : 1.000 Mode :character
## Mean : 485998506 Mean :243959456 Mean : 1.361
## 3rd Qu.:1090402120 3rd Qu.:478142022 3rd Qu.: 2.000
## Max. :1637766420 Max. :823738422 Max. :12.000
##
## jkl_fnpno jkl_mnpid jkl_fnpid jkl_mnspno
## Length:19040 Length:19040 Length:19040 Length:19040
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## jkl_fnspno jkl_mnspid jkl_fnspid jkl_grmpno
## Length:19040 Length:19040 Length:19040 Length:19040
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## jkl_grfpno jkl_childpno jkl_country
## Length:19040 Length:19040 England :14836
## Class :character Class :character missing : 5
## Mode :character Mode :character Northern Ireland: 1146
## Scotland : 1770
## Wales : 1283
##
##
## jkl_gor_dv jkl_urban_dv jkl_sex_dv
## South East :2425 missing : 56 Female :10404
## London :1954 rural area: 5198 inconsistent: 1
## North West :1813 urban area:13786 Male : 8304
## Scotland :1770 missing : 331
## South West :1702
## East of England:1642
## (Other) :7734
## jkl_mastat_dv
## Married :10788
## Single and never married/in civil partnership: 2413
## Living as couple : 1949
## Divorced : 1713
## Widowed : 1670
## Separated but legally married : 269
## (Other) : 238
## jkl_jbstat
## Retired :8040
## Paid employment(ft/pt):7527
## Self employed :1377
## LT sick or disabled : 716
## Unemployed : 536
## Family care or home : 399
## (Other) : 445
## jkl_ethn_dv bornuk_dv
## british/english/scottish/welsh/northern irish:16103 born in uk :16565
## indian : 530 missing : 236
## any other white background : 527 not born in uk: 2239
## caribbean : 313
## pakistani : 253
## african : 220
## (Other) : 1094
## jkl_jbsoc00_cc
## inapplicable :10154
## Teaching professionals : 481
## Healthcare and related personal services : 449
## Functional managers : 386
## missing : 364
## Managers and proprietors in other service industries: 322
## (Other) : 6884
## jkl_jbnssec8_dv jkl_jbnssec5_dv
## inapplicable :10148 inapplicable :10148
## Lower management & professional: 2652 Intermediate : 1179
## Semi-routine : 1291 Lower supervisory & technical: 514
## Intermediate : 1179 Management & professional : 3923
## Small employers & own account : 835 missing : 440
## Higher professional : 829 Semi-routine & routine : 2001
## (Other) : 2106 Small employers & own account: 835
## jkl_jbnssec3_dv jkl_fimnnet_dv jkl_health
## inapplicable :10148 Min. : -5589.5 don't know: 18
## Intermediate : 2014 1st Qu.: 937.7 No :10899
## Management & professional: 3923 Median : 1485.0 refusal : 34
## missing : 440 Mean : 1727.6 Yes : 8089
## Routine : 2515 3rd Qu.: 2166.7
## Max. :102890.9
##
## jkl_nkids_dv jkl_hhtype_dv
## none:19040 Couple 1 or more over pensionable age,no children :5707
## Couple both under pensionable age, no children :3293
## 3 or more adults, no children, incl. at least one couple:2979
## 1 adult under pensionable age, no children :2210
## 1 female, age 60+, no children :1931
## 1 male, aged 65+, no children : 767
## (Other) :2153
## jkl_tenure_dv jkl_fihhmnnet1_dv
## Owned outright :9777 Min. : -3306
## Owned with mortgage :4918 1st Qu.: 1772
## Rented private unfurnished:1406 Median : 2831
## Local authority rent :1265 Mean : 3369
## Housing assoc rented :1007 3rd Qu.: 4318
## Rented private furnished : 299 Max. :104319
## (Other) : 368
Participants are mainly from England but Northern Ireland, Scotland and Wales were also represented. Most participants were from urban areas. They were more than double those from rural areas. Female participants were the majority although not by a huge difference. Most participants are married. A significant number are divorced, living as a couple single and never married/in civil partnership and widowed. Majority of the participants are either retired or in paid employment. Most participants are British/ English/ Scottish/ Welsh/ Northern Irish. Other races were barely represented. Majority of the respondents were born in the UK. Most respondents had no long standing illness.
The variable jkl_jbsoc00_cc was unclear, so an alternative chart was created. The variables jkl_jbnssec8_dv, jkl_jbnssec5_dv, and jkl_jbnssec3_dv showed that most respondents marked their profession as inapplicable. Therefore, the following plots were created, excluding the inapplicable category from these variables.
# Calculate percentages for the job info
jobinfo <- data %>%
count(jkl_jbsoc00_cc) %>%
mutate(percentage = n / sum(n) * 100)
jobinfo
## jkl_jbsoc00_cc n
## 1 Administrative occupations: communications 10
## 2 Administrative occupations: finance 239
## 3 Administrative occupations: general 246
## 4 Administrative occupations: government and related organisations 104
## 5 Administrative occupations: records 161
## 6 Agricultural trades 102
## 7 Animal care services 25
## 8 Architects, town planners, surveyors 44
## 9 Artistic and literary occupations 65
## 10 Assemblers and routine operatives 80
## 11 Building trades 34
## 12 Business and finance associate professionals 261
## 13 Business and statistical professionals 110
## 14 Childcare and related personal services 251
## 15 Conservation associate professionals 12
## 16 Construction operatives 30
## 17 Construction trades 142
## 18 Corporate managers and senior officials 44
## 19 Customer service occupations 111
## 20 Design associate professionals 56
## 21 Draughtspersons and building inspectors 19
## 22 Electrical trades 84
## 23 Elementary administration occupations 76
## 24 Elementary agricultural occupations 18
## 25 Elementary cleaning occupations 216
## 26 Elementary construction occupations 37
## 27 Elementary goods storage occupations 81
## 28 Elementary personal services occupations 138
## 29 Elementary process plant occupations 49
## 30 Elementary sales occupations 29
## 31 Elementary security occupations 97
## 32 Engineering professionals 108
## 33 Financial institution and office managers 152
## 34 Food preparation trades 86
## 35 Functional managers 386
## 36 Hairdressers and related occupations 36
## 37 Health and social services managers 70
## 38 Health associate professionals 307
## 39 Health professionals 141
## 40 Healthcare and related personal services 449
## 41 Housekeeping occupations 41
## 42 IT service delivery occupations 81
## 43 Information and communication technology professionals 149
## 44 Legal associate professionals 27
## 45 Legal professionals 61
## 46 Leisure and travel service occupations 45
## 47 Librarians and related professionals 16
## 48 Managers and proprietors in hospitality and leisure services 69
## 49 Managers and proprietors in other service industries 322
## 50 Managers in distribution, storage and retailing 121
## 51 Managers in farming, horticulture, forestry and services 19
## 52 Media associate professionals 57
## 53 Metal forming, welding and related trades 18
## 54 Metal machining, fitting and instrument making trades 91
## 55 Mobile machine drivers and operatives 28
## 56 Personal services occupations nec 14
## 57 Plant and machine operatives 56
## 58 Printing trades 12
## 59 Process operatives 46
## 60 Production managers 127
## 61 Protective service occupations 81
## 62 Protective service officers 11
## 63 Public service and other associate professionals 173
## 64 Public service professionals 86
## 65 Quality and customer care managers 53
## 66 Research professionals 54
## 67 Sales and related associate professionals 136
## 68 Sales assistants and retail cashiers 308
## 69 Sales related occupations 40
## 70 Science and engineering technicians 85
## 71 Science professionals 33
## 72 Secretarial and related occupations 204
## 73 Skilled trades nec 26
## 74 Social welfare associate professionals 123
## 75 Sports and fitness occupations 31
## 76 Teaching professionals 481
## 77 Textiles and garments trades 17
## 78 Therapists 87
## 79 Transport associate professionals 25
## 80 Transport drivers and operatives 264
## 81 Vehicle trades 36
## 82 don't know 13
## 83 inapplicable 10154
## 84 missing 364
## 85 refusal 49
## percentage
## 1 0.05252101
## 2 1.25525210
## 3 1.29201681
## 4 0.54621849
## 5 0.84558824
## 6 0.53571429
## 7 0.13130252
## 8 0.23109244
## 9 0.34138655
## 10 0.42016807
## 11 0.17857143
## 12 1.37079832
## 13 0.57773109
## 14 1.31827731
## 15 0.06302521
## 16 0.15756303
## 17 0.74579832
## 18 0.23109244
## 19 0.58298319
## 20 0.29411765
## 21 0.09978992
## 22 0.44117647
## 23 0.39915966
## 24 0.09453782
## 25 1.13445378
## 26 0.19432773
## 27 0.42542017
## 28 0.72478992
## 29 0.25735294
## 30 0.15231092
## 31 0.50945378
## 32 0.56722689
## 33 0.79831933
## 34 0.45168067
## 35 2.02731092
## 36 0.18907563
## 37 0.36764706
## 38 1.61239496
## 39 0.74054622
## 40 2.35819328
## 41 0.21533613
## 42 0.42542017
## 43 0.78256303
## 44 0.14180672
## 45 0.32037815
## 46 0.23634454
## 47 0.08403361
## 48 0.36239496
## 49 1.69117647
## 50 0.63550420
## 51 0.09978992
## 52 0.29936975
## 53 0.09453782
## 54 0.47794118
## 55 0.14705882
## 56 0.07352941
## 57 0.29411765
## 58 0.06302521
## 59 0.24159664
## 60 0.66701681
## 61 0.42542017
## 62 0.05777311
## 63 0.90861345
## 64 0.45168067
## 65 0.27836134
## 66 0.28361345
## 67 0.71428571
## 68 1.61764706
## 69 0.21008403
## 70 0.44642857
## 71 0.17331933
## 72 1.07142857
## 73 0.13655462
## 74 0.64600840
## 75 0.16281513
## 76 2.52626050
## 77 0.08928571
## 78 0.45693277
## 79 0.13130252
## 80 1.38655462
## 81 0.18907563
## 82 0.06827731
## 83 53.32983193
## 84 1.91176471
## 85 0.25735294
# Filter out the 'inapplicable' category
filtered_data <- data %>%
filter(jkl_jbsoc00_cc != "inapplicable")
# Calculate percentages for the employment status
job <- filtered_data %>%
count(jkl_jbsoc00_cc) %>%
mutate(percentage = n / sum(n) * 100)
# Filter the top 20 categories by percentage
top_20_job <- job %>%
arrange(desc(percentage)) %>%
slice_head(n = 20)
# Plot the top 20 categories
top_20_plot <- ggplot(top_20_job, aes(x = reorder(jkl_jbsoc00_cc, percentage), y = percentage)) +
geom_bar(stat = "identity", fill = 'blue') +
labs(title = 'Top 20 Employment Status Categories', x = 'Employment Status', y = 'Percentage') +
coord_flip() +
theme_minimal()
# Print the plot
print(top_20_plot)
The largest group of respondents falls under “Lower management & professional,” comprising just under 30%. “Semi-routine,” “Intermediate,” “Small employers & own account,” “Higher professional,” and “Routine” categories each account for a smaller but still significant proportion of the respondents.”Lower supervisory & technical,” “Large employers & higher management,” and “Missing” categories represent the smallest percentages, each below 10%. This distribution highlights the predominance of lower management and professional roles among respondents, with a diverse spread across other employment statuses.
# Filter out the 'inapplicable' category
filtered_data1 <- data %>%
filter(jkl_jbnssec8_dv != "inapplicable")
# Calculate percentages for the employment status
sec <- filtered_data1 %>%
count(jkl_jbnssec8_dv) %>%
mutate(percentage = n / sum(n) * 100)
# Check the 'sec' data frame
print("SEC data frame after calculating percentages:")
## [1] "SEC data frame after calculating percentages:"
print(sec)
## jkl_jbnssec8_dv n percentage
## 1 Higher professional 829 9.322987
## 2 Intermediate 1179 13.259109
## 3 Large employers & higher management 442 4.970760
## 4 Lower management & professional 2652 29.824561
## 5 Lower supervisory & technical 514 5.780477
## 6 Routine 710 7.984705
## 7 Semi-routine 1291 14.518668
## 8 Small employers & own account 835 9.390463
## 9 missing 440 4.948268
# Plot the categories
sec_plot <- ggplot(sec, aes(x = reorder(jkl_jbnssec8_dv, percentage), y = percentage)) +
geom_bar(stat = "identity", fill = 'blue') +
labs(
title = "Classification of an individual's current job based on the National Statistics Socio-economic Classification (NS-SEC) eight-class model",
x = 'Employment Status',
y = 'Percentage'
) +
coord_flip() +
theme_minimal()
# Print the plot
print(sec_plot)
When looking at the “Classification of an individual’s current job based on the National Statistics Socio-economic Classification (NS-SEC) eight-class model”, the largest group of respondents have “Management & professional” as their employment status. They comprise of just under 45% of the total respondents. “Semi-routine & Routine” represents just above 20% of the population. “Intermediate” accounts for just below 15 % of the total population. “Small employers & own account” represents close to 10% of the population. Whereas “Lower supervisory & technical,” and the “Missing” categories represent the smallest percentages, each below 10%. This distribution indicates that a substantial portion of the respondents occupy lower management and professional roles, while fewer are in higher management or supervisory positions.
# Filter out the 'inapplicable' category
filtered_data2 <- data %>%
filter(jkl_jbnssec5_dv != "inapplicable")
# Calculate percentages for the employment status
sec5 <- filtered_data2 %>%
count(jkl_jbnssec5_dv) %>%
mutate(percentage = n / sum(n) * 100)
# Check the 'sec' data frame
print("Simple SEC data frame after calculating percentages:")
## [1] "Simple SEC data frame after calculating percentages:"
print(sec5)
## jkl_jbnssec5_dv n percentage
## 1 Intermediate 1179 13.259109
## 2 Lower supervisory & technical 514 5.780477
## 3 Management & professional 3923 44.118309
## 4 Semi-routine & routine 2001 22.503374
## 5 Small employers & own account 835 9.390463
## 6 missing 440 4.948268
# Plot the categories
sec_plot5 <- ggplot(sec5, aes(x = reorder(jkl_jbnssec5_dv, percentage), y = percentage)) +
geom_bar(stat = "identity", fill = 'blue') +
labs(
title = "Classification of an individual's current job based on the National Statistics Socio-economic Classification (NS-SEC) five-class model",
x = 'Employment Status',
y = 'Percentage'
) +
coord_flip() +
theme_minimal()
# Print the plot
print(sec_plot5)
Based on the National Statistics Socio-economic Classification (NS-SEC) five-class model, most respondents’ current jobs are categorized as “Lower management & professional,” making up nearly 30% of the total. This is followed by “Semi-routine” and “Routine” jobs, which together account for just over 20%. The “Intermediate” category includes just under 15% of the respondents. About 10% of the population falls under “Small employers & own account.” The “Lower supervisory & technical” and “Missing” categories have the smallest representation, each below 10%. This indicates a significant concentration of respondents in professional and management roles, with fewer in supervisory or technical positions.
# Filter out the 'inapplicable' category
filtered_data3 <- data %>%
filter(jkl_jbnssec3_dv != "inapplicable")
# Calculate percentages for the employment status
sec3 <- filtered_data3 %>%
count(jkl_jbnssec3_dv) %>%
mutate(percentage = n / sum(n) * 100)
# Check the 'sec' data frame
print("Simple SEC data frame after calculating percentages:")
## [1] "Simple SEC data frame after calculating percentages:"
print(sec3)
## jkl_jbnssec3_dv n percentage
## 1 Intermediate 2014 22.649573
## 2 Management & professional 3923 44.118309
## 3 Routine 2515 28.283851
## 4 missing 440 4.948268
# Plot the categories
sec_plot3 <- ggplot(sec3, aes(x = reorder(jkl_jbnssec3_dv, percentage), y = percentage)) +
geom_bar(stat = "identity", fill = 'blue') +
labs(
title = "Classification of an individual's current job based on the National Statistics Socio-economic Classification (NS-SEC) three-class model",
x = 'Employment Status',
y = 'Percentage'
) +
coord_flip() +
theme_minimal()
# Print the plot
print(sec_plot3)
The National Statistics Socio-economic Classification (NS-SEC) three-class model reveals that nearly 45% of the population falls under the “Management & professional” category, making it the largest group. Approximately 30% of respondents are in the “Routine” category, while just over 20% are classified as “Intermediate.” The “Missing” category accounts for a small 5% of the population. This distribution highlights a significant concentration of individuals in management and professional roles, with a substantial portion also engaged in routine jobs.
names(data)
## [1] "pidp" "jkl_hidp" "j_hidp"
## [4] "k_hidp" "l_hidp" "jkl_pno"
## [7] "jkl_mnpno" "jkl_fnpno" "jkl_mnpid"
## [10] "jkl_fnpid" "jkl_mnspno" "jkl_fnspno"
## [13] "jkl_mnspid" "jkl_fnspid" "jkl_grmpno"
## [16] "jkl_grfpno" "jkl_childpno" "jkl_country"
## [19] "jkl_gor_dv" "jkl_urban_dv" "jkl_sex_dv"
## [22] "jkl_mastat_dv" "jkl_jbstat" "jkl_ethn_dv"
## [25] "bornuk_dv" "jkl_jbsoc00_cc" "jkl_jbnssec8_dv"
## [28] "jkl_jbnssec5_dv" "jkl_jbnssec3_dv" "jkl_fimnnet_dv"
## [31] "jkl_health" "jkl_nkids_dv" "jkl_hhtype_dv"
## [34] "jkl_tenure_dv" "jkl_fihhmnnet1_dv"
The variables in this dataset cover a wide range of demographic, socio-economic, and health-related factors.
Variables that showcase demographics include Household Identifiers (jkl_hidp, j_hidp, k_hidp, l_hidp), Country (jkl_country) & Region (jkl_gor_dv) variables and Ethnic Group (jkl_ethn_dv)
The Household Identifiers (jkl_hidp, j_hidp, k_hidp, l_hidp) are crucial for distinguishing between different households and linking data across different waves. Understanding household dynamics is essential for economic analyses, such as household income distribution and the impact of economic policies on households. By linking personal and household identifiers (pidp, jkl_hidp, j_hidp, k_hidp, l_hidp), researchers can track individual and household economic outcomes over time. This relationship is essential for longitudinal studies to understand how personal and household circumstances change and the impact of economic policies over different periods.
# Check uniqueness of personal identifiers
unique_pidp <- length(unique(data$pidp)) == nrow(data)
cat("Are all pidp values unique? ", unique_pidp, "\n")
## Are all pidp values unique? FALSE
# Check consistency of household identifiers
unique_households <- data %>%
select(jkl_hidp, j_hidp, k_hidp, l_hidp) %>%
unique()
cat("Number of unique households: ", nrow(unique_households), "\n")
## Number of unique households: 13242
# Count the number of individuals per household
household_size <- data %>%
group_by(jkl_hidp) %>%
summarise(household_count = n())
# Summary statistics of household sizes
summary(household_size$household_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.438 2.000 7.000
# Create a frequency table for household sizes
household_size_freq <- household_size %>%
count(household_count) %>%
rename(frequency = n)
# Visualize household sizes with a line chart
ggplot(household_size_freq, aes(x = household_count, y = frequency)) +
geom_line() +
geom_point() +
labs(title = "Distribution of Household Sizes", x = "Household Size", y = "Frequency")
The chart above reveals that the most common household size is 2 members, with a frequency just over 6000, followed by 3-member households at slightly above 4000. One-member households also have a significant frequency, though less than 3-member households. Household sizes larger than 3 are much less common, with frequencies quickly tapering off, and larger household sizes (4, 5, 6, etc.) have very low frequencies compared to smaller household sizes. These observations imply a greater demand for smaller housing units and tailored public services to accommodate the predominant household sizes.
The Country (jkl_country) and Region (jkl_gor_dv) variables help to analyze geographic disparities in economic conditions. Different regions may have varying levels of economic development, employment opportunities, and living standards, which are critical for regional policy-making and targeted economic interventions.
The Urban and rural areas that is showcased by (jkl_urban_dv) often have different economic structures. Urban areas might have more diverse job opportunities and higher living costs, while rural areas might face challenges like limited access to services and employment. This variable is essential for understanding these differences and tailoring economic policies accordingly.
The Sex (derived) (jkl_sex_dv) variable highlights economic disparities between genders. Women often earn less than men due to the gender wage gap and face discrimination in hiring and promotions. They have less access to high-paying STEM fields and professional networks. Additionally, family responsibilities often lead to career interruptions and part-time work for women. Addressing these issues through targeted policies and workplace reforms is crucial for promoting gender equality and reducing economic disparities.
Ethnic Group (jkl_ethn_dv) can influence economic opportunities and outcomes due to factors like discrimination, social networks, and access to education. This variable is important for understanding and addressing economic disparities among different ethnic groups.
Employment Status (jkl_jbstat) is directly linked to income levels, economic stability, and overall well-being. Analyzing employment status can provide insights into labor market trends, unemployment rates, and the effectiveness of job creation programs.
The plots below showcase how employment status varies accross different demographics represented in the study.
key_variable <- "jkl_jbstat"
variables_to_pair <- c("jkl_country", "jkl_urban_dv", "jkl_sex_dv", "jkl_ethn_dv")
for (var in variables_to_pair){count_table <- table(data_factors[[key_variable]], data_factors[[var]])
prop_table <- prop.table(count_table, 1)
prop_df <- as.data.frame(prop_table)
names(prop_df) <- c("KeyVariable", "PairedVariable", "Proportion")
plot <- ggplot(prop_df, aes(x = KeyVariable, y = Proportion, fill = PairedVariable)) +
geom_bar(stat = "identity", position = "fill") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
labs(title = paste("Proportion of", var, "within", key_variable),
x = key_variable, y = "Proportion") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot)}
# Calculate the proportions
data_proportion <- data %>%
group_by(jkl_gor_dv, jkl_jbstat) %>%
summarise(count = n()) %>%
mutate(proportion = count / sum(count))
## `summarise()` has grouped output by 'jkl_gor_dv'. You can override using the
## `.groups` argument.
# Create the stacked bar chart
ggplot(data_proportion, aes(x = jkl_gor_dv, y = proportion, fill = jkl_jbstat)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal() +
labs(
title = "Proportion of Employment Status within Regions in the UK",
x = "Region",
y = "Proportion",
fill = "Employment Status"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
# Change "Yorkshire and the Humber" to "Yorkshire and The Humber" to match the map
data <- data %>%
mutate(jkl_gor_dv = ifelse(jkl_gor_dv == "Yorkshire and the Humber", "Yorkshire and The Humber", jkl_gor_dv))
# Define the path to the shapefile
shapefile_path <- "/Users/mellogwayo/Desktop/Household Financial Health Analytics Platform/data/processed/RGN_DEC_2021_EN_BGC.shp"
# Read the shapefile
uk_shapefile <- st_read(shapefile_path)
## Reading layer `RGN_DEC_2021_EN_BGC' from data source
## `/Users/mellogwayo/Desktop/Household Financial Health Analytics Platform/data/processed/RGN_DEC_2021_EN_BGC.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 9 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 82668.52 ymin: 5352.6 xmax: 655653.8 ymax: 657540.4
## Projected CRS: OSGB36 / British National Grid
# Simplify the geometry
uk_shapefile_simplified <- ms_simplify(uk_shapefile, keep = 0.01)
# Unique values in the shapefile
cat("Unique values in shapefile (RGN21NM):\n")
## Unique values in shapefile (RGN21NM):
print(unique(uk_shapefile_simplified$RGN21NM))
## [1] "North East" "North West"
## [3] "Yorkshire and The Humber" "East Midlands"
## [5] "West Midlands" "East of England"
## [7] "London" "South East"
## [9] "South West"
# Unique values in your data
cat("Unique values in data (jkl_gor_dv):\n")
## Unique values in data (jkl_gor_dv):
print(unique(data$jkl_gor_dv))
## [1] "North East" "West Midlands"
## [3] "North West" "Yorkshire and The Humber"
## [5] "Scotland" "East Midlands"
## [7] "London" "South East"
## [9] "South West" "East of England"
## [11] "Wales" "Northern Ireland"
## [13] "missing"
# Inspect column names in the shapefile
cat("Column names in the shapefile:\n")
## Column names in the shapefile:
print(names(uk_shapefile_simplified))
## [1] "RGN21CD" "BNG_N" "RGN21NM" "BNG_E" "LONG" "LAT" "GlobalID"
## [8] "geometry"
# Inspect column names in your data
cat("Column names in your data:\n")
## Column names in your data:
print(names(data))
## [1] "pidp" "jkl_hidp" "j_hidp"
## [4] "k_hidp" "l_hidp" "jkl_pno"
## [7] "jkl_mnpno" "jkl_fnpno" "jkl_mnpid"
## [10] "jkl_fnpid" "jkl_mnspno" "jkl_fnspno"
## [13] "jkl_mnspid" "jkl_fnspid" "jkl_grmpno"
## [16] "jkl_grfpno" "jkl_childpno" "jkl_country"
## [19] "jkl_gor_dv" "jkl_urban_dv" "jkl_sex_dv"
## [22] "jkl_mastat_dv" "jkl_jbstat" "jkl_ethn_dv"
## [25] "bornuk_dv" "jkl_jbsoc00_cc" "jkl_jbnssec8_dv"
## [28] "jkl_jbnssec5_dv" "jkl_jbnssec3_dv" "jkl_fimnnet_dv"
## [31] "jkl_health" "jkl_nkids_dv" "jkl_hhtype_dv"
## [34] "jkl_tenure_dv" "jkl_fihhmnnet1_dv"
# Define the regions present in the shapefile
regions_in_shapefile <- unique(uk_shapefile_simplified$RGN21NM)
# Filter the data to include only the regions present in the shapefile
filtered_data <- data %>%
filter(jkl_gor_dv %in% regions_in_shapefile)
# Merge simplified shapefile with the filtered data
uk_data_filtered <- uk_shapefile_simplified %>%
left_join(filtered_data, by = c("RGN21NM" = "jkl_gor_dv"))
# Plot the map
ggplot(data = uk_data_filtered) +
geom_sf(aes(fill = jkl_jbstat)) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Employment Status in Different Regions in the UK",
fill = "Employment Status"
)
# Summarize the data to get proportions
data_summary <- data %>%
group_by(jkl_jbstat, jkl_ethn_dv) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
#Long format for pie chart
data_long <- data_summary %>%
select(jkl_jbstat, jkl_ethn_dv, proportion)
# Get unique categories of jkl_jbstat
categories <- unique(data_long$jkl_jbstat)
# Generate and print pie charts for each category
for (category in categories) {
plot_data <- data_long %>% filter(jkl_jbstat == category)
p <- ggplot(plot_data, aes(x = "", y = proportion, fill = jkl_ethn_dv)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
theme_minimal(base_size = 15) +
labs(title = paste("Proportion of jkl_ethn_dv within", category),
fill = "Ethnic Group",
y = "Proportion",
x = "") +
theme(
axis.text.x = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(size = 16)
)
# Print the plot
print(p)
}
Standard Socio-economic Classification (SOC) (jkl_jbsoc00_cc, jkl_jbnssec8_dv, jkl_jbnssec5_dv, jkl_jbnssec3_dv) provide a detailed look at occupational status and social class. They are crucial for studying social mobility, occupational earnings, and the impact of economic policies on different socio-economic groups.
key_variables <- c("jkl_jbnssec8_dv", "jkl_jbnssec5_dv", "jkl_jbnssec3_dv")
variables_to_pair <- c("jkl_country", "jkl_urban_dv", "jkl_sex_dv", "jkl_ethn_dv")
for (key_variable in key_variables) {
for (var in variables_to_pair) {
count_table <- table(data_factors[[key_variable]], data_factors[[var]])
prop_table <- prop.table(count_table, 1)
prop_df <- as.data.frame(prop_table)
names(prop_df) <- c("KeyVariable", "PairedVariable", "Proportion")
plot <- ggplot(prop_df, aes(x = KeyVariable, y = Proportion, fill = PairedVariable)) +
geom_bar(stat = "identity", position = "fill") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
labs(title = paste("Proportion of", var, "within", key_variable),
x = key_variable, y = "Proportion") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot)
}
}
# List of key variables
key_variables <- c("jkl_jbnssec8_dv", "jkl_jbnssec5_dv", "jkl_jbnssec3_dv")
# Loop over each key variable
for (key_variable in key_variables) {
# Filter out 'inapplicable' category and calculate the proportions
data_proportion <- data %>%
filter(!!sym(key_variable) != "inapplicable") %>%
group_by(jkl_gor_dv, !!sym(key_variable)) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
# Create the stacked bar chart
plot <- ggplot(data_proportion, aes(x = jkl_gor_dv, y = proportion, fill = !!sym(key_variable))) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal() +
labs(
title = paste("Proportion of", key_variable, "within Regions in the UK"),
x = "Region",
y = "Proportion",
fill = key_variable
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
# Print the plot
print(plot)
}
# List of key variables
key_variables <- c("jkl_jbnssec8_dv", "jkl_jbnssec5_dv", "jkl_jbnssec3_dv")
# Print unique values and column names for debugging
cat("Unique values in shapefile (RGN21NM):\n")
## Unique values in shapefile (RGN21NM):
print(unique(uk_shapefile_simplified$RGN21NM))
## [1] "North East" "North West"
## [3] "Yorkshire and The Humber" "East Midlands"
## [5] "West Midlands" "East of England"
## [7] "London" "South East"
## [9] "South West"
cat("Unique values in data (jkl_gor_dv):\n")
## Unique values in data (jkl_gor_dv):
print(unique(data$jkl_gor_dv))
## [1] "North East" "West Midlands"
## [3] "North West" "Yorkshire and The Humber"
## [5] "Scotland" "East Midlands"
## [7] "London" "South East"
## [9] "South West" "East of England"
## [11] "Wales" "Northern Ireland"
## [13] "missing"
cat("Column names in the shapefile:\n")
## Column names in the shapefile:
print(names(uk_shapefile_simplified))
## [1] "RGN21CD" "BNG_N" "RGN21NM" "BNG_E" "LONG" "LAT" "GlobalID"
## [8] "geometry"
cat("Column names in your data:\n")
## Column names in your data:
print(names(data))
## [1] "pidp" "jkl_hidp" "j_hidp"
## [4] "k_hidp" "l_hidp" "jkl_pno"
## [7] "jkl_mnpno" "jkl_fnpno" "jkl_mnpid"
## [10] "jkl_fnpid" "jkl_mnspno" "jkl_fnspno"
## [13] "jkl_mnspid" "jkl_fnspid" "jkl_grmpno"
## [16] "jkl_grfpno" "jkl_childpno" "jkl_country"
## [19] "jkl_gor_dv" "jkl_urban_dv" "jkl_sex_dv"
## [22] "jkl_mastat_dv" "jkl_jbstat" "jkl_ethn_dv"
## [25] "bornuk_dv" "jkl_jbsoc00_cc" "jkl_jbnssec8_dv"
## [28] "jkl_jbnssec5_dv" "jkl_jbnssec3_dv" "jkl_fimnnet_dv"
## [31] "jkl_health" "jkl_nkids_dv" "jkl_hhtype_dv"
## [34] "jkl_tenure_dv" "jkl_fihhmnnet1_dv"
# Define the regions present in the shapefile
regions_in_shapefile <- unique(uk_shapefile_simplified$RGN21NM)
# Filter the data to include only the regions present in the shapefile
filtered_data <- data %>%
filter(jkl_gor_dv %in% regions_in_shapefile)
# Loop over each key variable
for (key_variable in key_variables) {
# Remove 'inapplicable' category
filtered_data_variable <- filtered_data %>%
filter(!!sym(key_variable) != "inapplicable")
# Merge simplified shapefile with the filtered data for the current key variable
uk_data_filtered <- uk_shapefile_simplified %>%
left_join(filtered_data_variable, by = c("RGN21NM" = "jkl_gor_dv"))
# Plot the map for the current key variable
plot <- ggplot(data = uk_data_filtered) +
geom_sf(aes_string(fill = key_variable)) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = paste("Proportion of", key_variable, "in Different Regions in the UK"),
fill = key_variable
)
# Print the plot
print(plot)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# List of key variables for pie charts
key_variables <- c("jkl_jbnssec8_dv", "jkl_jbnssec5_dv", "jkl_jbnssec3_dv")
# Loop over each key variable
for (key_variable in key_variables) {
# Summarize the data to get proportions
data_summary <- data %>%
group_by(!!sym(key_variable), jkl_ethn_dv) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
# Long format for pie chart
data_long <- data_summary %>%
select(!!sym(key_variable), jkl_ethn_dv, proportion)
# Get unique categories of the current key variable
categories <- unique(data_long[[key_variable]])
# Generate and print pie charts for each category
for (category in categories) {
plot_data <- data_long %>% filter((!!sym(key_variable)) == category)
p <- ggplot(plot_data, aes(x = "", y = proportion, fill = jkl_ethn_dv)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
theme_minimal(base_size = 15) +
labs(
title = paste("Proportion of jkl_ethn_dv within", category, "of", key_variable),
fill = "Ethnic Group",
y = "Proportion",
x = ""
) +
theme(
axis.text.x = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(size = 16)
)
# Print the plot
print(p)
}
}
Health and Well-being (jkl_health) show how health status significantly affects economic productivity and quality of life. Poor health can lead to increased healthcare costs and reduced labor market participation. These variables are essential for understanding the economic impact of health and designing health-related economic policies.
key_variable <- "jkl_health"
variables_to_pair <- c("jkl_country", "jkl_urban_dv", "jkl_sex_dv", "jkl_ethn_dv")
for (var in variables_to_pair) {
count_table <- table(data_factors[[key_variable]], data_factors[[var]])
prop_table <- prop.table(count_table, 1)
prop_df <- as.data.frame(prop_table)
names(prop_df) <- c("KeyVariable", "PairedVariable", "Proportion")
plot <- ggplot(prop_df, aes(x = KeyVariable, y = Proportion, fill = PairedVariable)) +
geom_bar(stat = "identity", position = "fill") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
labs(title = paste("Proportion of", var, "within", key_variable),
x = key_variable, y = "Proportion") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot)
}
# List of key variables
key_variables <- "jkl_health"
# Loop over each key variable
for (key_variable in key_variables) {
# Filter out 'inapplicable' category and calculate the proportions
data_proportion <- data %>%
filter(!!sym(key_variable) != "inapplicable") %>%
group_by(jkl_gor_dv, !!sym(key_variable)) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
# Create the stacked bar chart
plot <- ggplot(data_proportion, aes(x = jkl_gor_dv, y = proportion, fill = !!sym(key_variable))) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal() +
labs(
title = paste("Proportion of", key_variable, "within Regions in the UK"),
x = "Region",
y = "Proportion",
fill = key_variable
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
# Print the plot
print(plot)
}
# List of key variables
key_variables <- "jkl_health"
# Print unique values and column names for debugging
cat("Unique values in shapefile (RGN21NM):\n")
## Unique values in shapefile (RGN21NM):
print(unique(uk_shapefile_simplified$RGN21NM))
## [1] "North East" "North West"
## [3] "Yorkshire and The Humber" "East Midlands"
## [5] "West Midlands" "East of England"
## [7] "London" "South East"
## [9] "South West"
cat("Unique values in data (jkl_gor_dv):\n")
## Unique values in data (jkl_gor_dv):
print(unique(data$jkl_gor_dv))
## [1] "North East" "West Midlands"
## [3] "North West" "Yorkshire and The Humber"
## [5] "Scotland" "East Midlands"
## [7] "London" "South East"
## [9] "South West" "East of England"
## [11] "Wales" "Northern Ireland"
## [13] "missing"
cat("Column names in the shapefile:\n")
## Column names in the shapefile:
print(names(uk_shapefile_simplified))
## [1] "RGN21CD" "BNG_N" "RGN21NM" "BNG_E" "LONG" "LAT" "GlobalID"
## [8] "geometry"
cat("Column names in your data:\n")
## Column names in your data:
print(names(data))
## [1] "pidp" "jkl_hidp" "j_hidp"
## [4] "k_hidp" "l_hidp" "jkl_pno"
## [7] "jkl_mnpno" "jkl_fnpno" "jkl_mnpid"
## [10] "jkl_fnpid" "jkl_mnspno" "jkl_fnspno"
## [13] "jkl_mnspid" "jkl_fnspid" "jkl_grmpno"
## [16] "jkl_grfpno" "jkl_childpno" "jkl_country"
## [19] "jkl_gor_dv" "jkl_urban_dv" "jkl_sex_dv"
## [22] "jkl_mastat_dv" "jkl_jbstat" "jkl_ethn_dv"
## [25] "bornuk_dv" "jkl_jbsoc00_cc" "jkl_jbnssec8_dv"
## [28] "jkl_jbnssec5_dv" "jkl_jbnssec3_dv" "jkl_fimnnet_dv"
## [31] "jkl_health" "jkl_nkids_dv" "jkl_hhtype_dv"
## [34] "jkl_tenure_dv" "jkl_fihhmnnet1_dv"
# Define the regions present in the shapefile
regions_in_shapefile <- unique(uk_shapefile_simplified$RGN21NM)
# Filter the data to include only the regions present in the shapefile
filtered_data <- data %>%
filter(jkl_gor_dv %in% regions_in_shapefile)
# Loop over each key variable
for (key_variable in key_variables) {
# Remove 'inapplicable' category
filtered_data_variable <- filtered_data %>%
filter(!!sym(key_variable) != "inapplicable")
# Merge simplified shapefile with the filtered data for the current key variable
uk_data_filtered <- uk_shapefile_simplified %>%
left_join(filtered_data_variable, by = c("RGN21NM" = "jkl_gor_dv"))
# Plot the map for the current key variable
plot <- ggplot(data = uk_data_filtered) +
geom_sf(aes_string(fill = key_variable)) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = paste("Proportion of", key_variable, "in Different Regions in the UK"),
fill = key_variable
)
# Print the plot
print(plot)
}
# List of key variables for pie charts
key_variables <- "jkl_health"
# Loop over each key variable
for (key_variable in key_variables) {
# Summarize the data to get proportions
data_summary <- data %>%
group_by(!!sym(key_variable), jkl_ethn_dv) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
# Long format for pie chart
data_long <- data_summary %>%
select(!!sym(key_variable), jkl_ethn_dv, proportion)
# Get unique categories of the current key variable
categories <- unique(data_long[[key_variable]])
# Generate and print pie charts for each category
for (category in categories) {
plot_data <- data_long %>% filter((!!sym(key_variable)) == category)
p <- ggplot(plot_data, aes(x = "", y = proportion, fill = jkl_ethn_dv)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
theme_minimal(base_size = 15) +
labs(
title = paste("Proportion of jkl_ethn_dv within", category, "of", key_variable),
fill = "Ethnic Group",
y = "Proportion",
x = ""
) +
theme(
axis.text.x = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(size = 16)
)
# Print the plot
print(p)
}
}
Household Composition and Size (jkl_hhsize, jkl_nkids_dv, jkl_hhtype_dv) affects economic needs and consumption patterns. Larger households or those with more children may have different financial pressures compared to smaller households. These variables help in analyzing the economic needs of different household types.
Housing Tenure (jkl_tenure_dv), which show whether renting or owning affects household expenditure, wealth accumulation, and financial stability. This variable is important for studying housing market trends and the impact of housing policies.
key_variable <- "jkl_tenure_dv"
variables_to_pair <- c("jkl_country", "jkl_urban_dv", "jkl_sex_dv", "jkl_ethn_dv")
for (var in variables_to_pair) {
count_table <- table(data_factors[[key_variable]], data_factors[[var]])
prop_table <- prop.table(count_table, 1)
prop_df <- as.data.frame(prop_table)
names(prop_df) <- c("KeyVariable", "PairedVariable", "Proportion")
plot <- ggplot(prop_df, aes(x = KeyVariable, y = Proportion, fill = PairedVariable)) +
geom_bar(stat = "identity", position = "fill") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
labs(title = paste("Proportion of", var, "within", key_variable),
x = key_variable, y = "Proportion") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot)
}
# List of key variables
key_variables <- "jkl_tenure_dv"
# Loop over each key variable
for (key_variable in key_variables) {
# Filter out 'inapplicable' category and calculate the proportions
data_proportion <- data %>%
filter(!!sym(key_variable) != "inapplicable") %>%
group_by(jkl_gor_dv, !!sym(key_variable)) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
# Create the stacked bar chart
plot <- ggplot(data_proportion, aes(x = jkl_gor_dv, y = proportion, fill = !!sym(key_variable))) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal() +
labs(
title = paste("Proportion of", key_variable, "within Regions in the UK"),
x = "Region",
y = "Proportion",
fill = key_variable
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
# Print the plot
print(plot)
}
# List of key variables
key_variables <- "jkl_tenure_dv"
# Print unique values and column names for debugging
cat("Unique values in shapefile (RGN21NM):\n")
## Unique values in shapefile (RGN21NM):
print(unique(uk_shapefile_simplified$RGN21NM))
## [1] "North East" "North West"
## [3] "Yorkshire and The Humber" "East Midlands"
## [5] "West Midlands" "East of England"
## [7] "London" "South East"
## [9] "South West"
cat("Unique values in data (jkl_gor_dv):\n")
## Unique values in data (jkl_gor_dv):
print(unique(data$jkl_gor_dv))
## [1] "North East" "West Midlands"
## [3] "North West" "Yorkshire and The Humber"
## [5] "Scotland" "East Midlands"
## [7] "London" "South East"
## [9] "South West" "East of England"
## [11] "Wales" "Northern Ireland"
## [13] "missing"
cat("Column names in the shapefile:\n")
## Column names in the shapefile:
print(names(uk_shapefile_simplified))
## [1] "RGN21CD" "BNG_N" "RGN21NM" "BNG_E" "LONG" "LAT" "GlobalID"
## [8] "geometry"
cat("Column names in your data:\n")
## Column names in your data:
print(names(data))
## [1] "pidp" "jkl_hidp" "j_hidp"
## [4] "k_hidp" "l_hidp" "jkl_pno"
## [7] "jkl_mnpno" "jkl_fnpno" "jkl_mnpid"
## [10] "jkl_fnpid" "jkl_mnspno" "jkl_fnspno"
## [13] "jkl_mnspid" "jkl_fnspid" "jkl_grmpno"
## [16] "jkl_grfpno" "jkl_childpno" "jkl_country"
## [19] "jkl_gor_dv" "jkl_urban_dv" "jkl_sex_dv"
## [22] "jkl_mastat_dv" "jkl_jbstat" "jkl_ethn_dv"
## [25] "bornuk_dv" "jkl_jbsoc00_cc" "jkl_jbnssec8_dv"
## [28] "jkl_jbnssec5_dv" "jkl_jbnssec3_dv" "jkl_fimnnet_dv"
## [31] "jkl_health" "jkl_nkids_dv" "jkl_hhtype_dv"
## [34] "jkl_tenure_dv" "jkl_fihhmnnet1_dv"
# Define the regions present in the shapefile
regions_in_shapefile <- unique(uk_shapefile_simplified$RGN21NM)
# Filter the data to include only the regions present in the shapefile
filtered_data <- data %>%
filter(jkl_gor_dv %in% regions_in_shapefile)
# Loop over each key variable
for (key_variable in key_variables) {
# Remove 'inapplicable' category
filtered_data_variable <- filtered_data %>%
filter(!!sym(key_variable) != "inapplicable")
# Merge simplified shapefile with the filtered data for the current key variable
uk_data_filtered <- uk_shapefile_simplified %>%
left_join(filtered_data_variable, by = c("RGN21NM" = "jkl_gor_dv"))
# Plot the map for the current key variable
plot <- ggplot(data = uk_data_filtered) +
geom_sf(aes_string(fill = key_variable)) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = paste("Proportion of", key_variable, "in Different Regions in the UK"),
fill = key_variable
)
# Print the plot
print(plot)
}
# List of key variables for pie charts
key_variables <- "jkl_tenure_dv"
# Loop over each key variable
for (key_variable in key_variables) {
# Summarize the data to get proportions
data_summary <- data %>%
group_by(!!sym(key_variable), jkl_ethn_dv) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
#Long format for pie chart
data_long <- data_summary %>%
select(!!sym(key_variable), jkl_ethn_dv, proportion)
# Get unique categories of the current key variable
categories <- unique(data_long[[key_variable]])
# Generate and print pie charts for each category
for (category in categories) {
plot_data <- data_long %>% filter((!!sym(key_variable)) == category)
p <- ggplot(plot_data, aes(x = "", y = proportion, fill = jkl_ethn_dv)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
theme_minimal(base_size = 15) +
labs(
title = paste("Proportion of jkl_ethn_dv within", category, "of", key_variable),
fill = "Ethnic Group",
y = "Proportion",
x = ""
) +
theme(
axis.text.x = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(size = 16)
)
# Print the plot
print(p)
}
}
Net Household Monthly Income (jkl_fihhmnnet1_dv) is a critical measure of economic well-being. It influences consumption, savings, and investment decisions. Analyzing household income helps in understanding economic inequality and the standard of living.
key_variable <- "jkl_fihhmnnet1_dv"
variables_to_pair <- c("jkl_country", "jkl_urban_dv", "jkl_sex_dv", "jkl_ethn_dv")
for (var in variables_to_pair) {
if (!(var %in% names(data_factors))) {
warning(paste("The variable", var, "is not present in data_factors. Skipping."))
next
}
plot <- ggplot(data_factors, aes_string(x = var, y = key_variable, fill = var)) +
geom_boxplot() +
geom_hline(yintercept = median(data_factors[[key_variable]], na.rm = TRUE), linetype = "dashed", color = "red") +
labs(title = paste("Distribution of", key_variable, "across", var),
x = var, y = "Net Household Monthly Income") +
ylim(0, 12500) + # Set the y-axis limit
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print plot to the R console
print(plot)
}
## Warning: Removed 182 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 182 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 182 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 182 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Define the numerical key variable
key_variable <- "jkl_fihhmnnet1_dv"
# Print unique values and column names for debugging
cat("Unique values in shapefile (RGN21NM):\n")
## Unique values in shapefile (RGN21NM):
print(unique(uk_shapefile_simplified$RGN21NM))
## [1] "North East" "North West"
## [3] "Yorkshire and The Humber" "East Midlands"
## [5] "West Midlands" "East of England"
## [7] "London" "South East"
## [9] "South West"
cat("Unique values in data (jkl_gor_dv):\n")
## Unique values in data (jkl_gor_dv):
print(unique(data$jkl_gor_dv))
## [1] "North East" "West Midlands"
## [3] "North West" "Yorkshire and The Humber"
## [5] "Scotland" "East Midlands"
## [7] "London" "South East"
## [9] "South West" "East of England"
## [11] "Wales" "Northern Ireland"
## [13] "missing"
cat("Column names in the shapefile:\n")
## Column names in the shapefile:
print(names(uk_shapefile_simplified))
## [1] "RGN21CD" "BNG_N" "RGN21NM" "BNG_E" "LONG" "LAT" "GlobalID"
## [8] "geometry"
cat("Column names in your data:\n")
## Column names in your data:
print(names(data))
## [1] "pidp" "jkl_hidp" "j_hidp"
## [4] "k_hidp" "l_hidp" "jkl_pno"
## [7] "jkl_mnpno" "jkl_fnpno" "jkl_mnpid"
## [10] "jkl_fnpid" "jkl_mnspno" "jkl_fnspno"
## [13] "jkl_mnspid" "jkl_fnspid" "jkl_grmpno"
## [16] "jkl_grfpno" "jkl_childpno" "jkl_country"
## [19] "jkl_gor_dv" "jkl_urban_dv" "jkl_sex_dv"
## [22] "jkl_mastat_dv" "jkl_jbstat" "jkl_ethn_dv"
## [25] "bornuk_dv" "jkl_jbsoc00_cc" "jkl_jbnssec8_dv"
## [28] "jkl_jbnssec5_dv" "jkl_jbnssec3_dv" "jkl_fimnnet_dv"
## [31] "jkl_health" "jkl_nkids_dv" "jkl_hhtype_dv"
## [34] "jkl_tenure_dv" "jkl_fihhmnnet1_dv"
# Define the regions present in the shapefile
regions_in_shapefile <- unique(uk_shapefile_simplified$RGN21NM)
# Filter the data to include only the regions present in the shapefile
filtered_data <- data %>%
filter(jkl_gor_dv %in% regions_in_shapefile)
# Bin the numerical variable into categories
filtered_data <- filtered_data %>%
mutate(income_bin = cut(
!!sym(key_variable),
breaks = c(-Inf, 1000, 2000, 3000, 4000, 5000, Inf),
labels = c("<=1000", "1001-2000", "2001-3000", "3001-4000", "4001-5000", ">5000"),
right = FALSE
))
# Summarize the data to get proportions
data_summary <- filtered_data %>%
group_by(jkl_gor_dv, income_bin) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
# Merge simplified shapefile with the filtered data
uk_data_filtered <- uk_shapefile_simplified %>%
left_join(data_summary, by = c("RGN21NM" = "jkl_gor_dv"))
# Plot the map for each income bin
for (bin in unique(uk_data_filtered$income_bin)) {
plot_data <- uk_data_filtered %>%
filter(income_bin == bin)
plot <- ggplot(data = plot_data) +
geom_sf(aes(fill = proportion)) +
scale_fill_viridis_c() +
theme_minimal() +
labs(
title = paste("Proportion of Income Bin", bin, "in Different Regions in the UK"),
fill = "Proportion"
)
# Print the plot
print(plot)
}
# Define the numerical key variable
key_variable <- "jkl_fihhmnnet1_dv"
# Bin the numerical variable into categories
data <- data %>%
mutate(income_bin = cut(
!!sym(key_variable),
breaks = c(-Inf, 1000, 2000, 3000, 4000, 5000, Inf),
labels = c("<=1000", "1001-2000", "2001-3000", "3001-4000", "4001-5000", ">5000"),
right = FALSE
))
# Summarize the data to get proportions
data_summary <- data %>%
group_by(income_bin, jkl_ethn_dv) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(proportion = count / sum(count))
#Long format for pie chart
data_long <- data_summary %>%
select(income_bin, jkl_ethn_dv, proportion)
# Get unique categories of the binned income
categories <- unique(data_long$income_bin)
# Generate and print pie charts for each category
for (category in categories) {
plot_data <- data_long %>% filter(income_bin == category)
p <- ggplot(plot_data, aes(x = "", y = proportion, fill = jkl_ethn_dv)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
theme_minimal(base_size = 15) +
labs(
title = paste("Proportion of jkl_ethn_dv within Income Bin", category),
fill = "Ethnic Group",
y = "Proportion",
x = ""
) +
theme(
axis.text.x = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(size = 16)
)
# Print the plot
print(p)
}
Understanding the relationships between these variables can provide deeper insights into various economic phenomena. Therefore, it is crucial to understand the potential relationships and their economic implications.
Analyzing the relationships between Country and Region variables (jkl_country, jkl_gor_dv) with Employment Status (jkl_jbstat) and Income variables (jkl_fimnnet_dv, jkl_fimnlabnet_dv, jkl_fihhmnnet1_dv) can reveal geographic disparities in employment opportunities and income levels. For instance, some regions may have higher unemployment rates and lower income levels, indicating the need for regional economic development programs or targeted employment policies.
# Filter data for England
england_data <- data %>%
filter(jkl_country == "England")
# Divide the categories into two subsets
categories <- unique(england_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
england_data_1 <- england_data %>%
filter(jkl_jbstat %in% categories_1)
england_data_2 <- england_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(england_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in England (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 110000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(england_data_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in England (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
print(plot2)
# Calculate summary statistics
summary_statistics <- england_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 16 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 77 2857. 2712. 2103. 0 8.99e3 1128. 3850.
## 2 Family care or home 332 3645. 2939. 2949. 0 2.54e4 1761. 4724.
## 3 Full-time student 89 1942. 1633. 1610. 0 6.02e3 633. 2685.
## 4 Govt training scheme 1 2722 2722 NA 2722 2.72e3 2722 2722
## 5 LT sick or disabled 497 2302. 1869. 1802. 0 2.10e4 1302. 2871.
## 6 On apprenticeship 6 2502. 2921 1359. 650 4.12e3 1499. 3257.
## 7 On furlough 113 3289. 2870. 2279. 0 1.70e4 1783. 4135.
## 8 On maternity leave 8 4195. 2075. 5007. 876. 1.59e4 1595. 4214.
## 9 Paid employment(ft/pt) 5959 4048. 3600 2817. -3306. 1.04e5 2363. 5091.
## 10 Retired 6151 2947. 2461. 2053. 0 2.31e4 1622. 3659.
## 11 Self employed 1118 4142. 3245. 4620. -3306. 1.04e5 1839. 4995.
## 12 Temporarily laid off/sho… 15 2858. 2388. 2304. 80 7.98e3 1130. 3838.
## 13 Unemployed 437 2032. 1647. 1975. 0 1.99e4 740. 2766.
## 14 Unpaid, family business 10 3532. 3062. 2220. 1300 8.45e3 1990. 4430.
## 15 don't know 7 3237. 1138. 4587. 0 9.92e3 272. 5528.
## 16 refusal 16 6383. 3795. 7148. 24.9 1.99e4 1897. 8049.
# Filter data
Northern_Ireland_data <- data %>%
filter(jkl_country == "Northern Ireland")
# Divide the categories into two subsets
categories <- unique(Northern_Ireland_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
Northern_Ireland_data_1 <- Northern_Ireland_data %>%
filter(jkl_jbstat %in% categories_1)
Northern_Ireland_2 <- Northern_Ireland_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(Northern_Ireland_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Northern Ireland (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(Northern_Ireland_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Northern Ireland (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
print(plot2)
# Calculate summary statistics
summary_statistics <- Northern_Ireland_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 11 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 11 2722. 1900. 2145. 240 6968. 1077. 4167.
## 2 Family care or home 24 3026. 2564. 1704. 1136. 7957. 1798. 3821.
## 3 Full-time student 1 2173. 2173. NA 2173. 2173. 2173. 2173.
## 4 LT sick or disabled 88 2199. 1968. 1387. 0 9967. 1285. 2837.
## 5 On furlough 7 2995. 2757. 1205. 1427. 4626. 2184. 3896.
## 6 On maternity leave 1 1683. 1683. NA 1683. 1683. 1683. 1683.
## 7 Paid employment(ft/pt) 403 3789. 3362. 2217. 0 21825. 2174. 4957
## 8 Retired 504 2781. 2275. 2196. 0 23362. 1504. 3363.
## 9 Self employed 70 3585. 2799. 3269. -794. 21825. 1543. 4686.
## 10 Unemployed 36 1590. 1090. 1306. 0 4472. 636. 2301.
## 11 Unpaid, family business 1 2481. 2481. NA 2481. 2481. 2481. 2481.
# Filter data
Scotland_data <- data %>%
filter(jkl_country == "Scotland")
# Divide the categories into two subsets
categories <- unique(Scotland_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
Scotland_data_1 <- Scotland_data %>%
filter(jkl_jbstat %in% categories_1)
Scotland_2 <- Scotland_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(Scotland_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Scotland (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(Scotland_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Scotland (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
# Calculate summary statistics
summary_statistics <- Scotland_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 14 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 6 1740. 1635. 1430. 315. 4179. 673. 2150.
## 2 Family care or home 23 3156. 2951. 2070. 141. 8957 1799. 4138.
## 3 Full-time student 21 2886. 1680 4213. 155. 19036. 731. 2611.
## 4 Govt training scheme 1 1635. 1635. NA 1635. 1635. 1635. 1635.
## 5 LT sick or disabled 80 2400. 1995. 2639. 0 23136. 1218. 2858.
## 6 On furlough 16 2957. 3155. 1718. 200 7386. 1512. 3796.
## 7 On maternity leave 5 3688. 3104. 1659. 2102 6192. 2563. 4476.
## 8 Paid employment(ft/pt) 691 3709. 3312. 2409. 0 30812. 2204. 4727.
## 9 Retired 779 2740. 2271. 1951. 0 14223. 1501. 3318.
## 10 Self employed 109 3318. 2636. 2313. 0 10658. 1667. 4629.
## 11 Temporarily laid off/… 3 12389. 4464. 16007. 1891. 30812. 3178. 17638.
## 12 Unemployed 34 1660. 1239. 1181. 0 4844. 888. 2377.
## 13 Unpaid, family busine… 1 4710. 4710. NA 4710. 4710. 4710. 4710.
## 14 refusal 1 11364. 11364. NA 11364. 11364. 11364. 11364.
# Filter data
Wales_data <- data %>%
filter(jkl_country == "Wales")
# Divide the categories into two subsets
categories <- unique(Wales_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
Wales_data_1 <- Wales_data %>%
filter(jkl_jbstat %in% categories_1)
Wales_2 <- Wales_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(Wales_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Wales (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(Wales_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Wales (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <- Wales_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 13 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 7 2918. 2712. 1770. 1007. 5780. 1619. 3843.
## 2 Family care or home 20 2817. 3062. 1393. 52.5 5923. 1873. 3531.
## 3 Full-time student 6 2454. 2325. 1543. 933. 4502. 1124. 3518.
## 4 LT sick or disabled 50 2777. 2332. 1415. 784 7087. 1821. 3584.
## 5 On furlough 10 2941. 2725. 1647. 558. 6757. 2605. 3420.
## 6 Paid employment(ft/pt) 472 3598. 3343. 1906. 498 12391. 2245. 4434.
## 7 Retired 605 2600. 2279. 1493. 0 8778. 1535. 3403.
## 8 Self employed 80 3483. 3251. 2141. 52.5 13255. 2119. 4286.
## 9 Temporarily laid off/shor… 1 3562. 3562. NA 3562. 3562. 3562. 3562.
## 10 Unemployed 29 1450. 1540. 988. 54.2 4094. 498 2028.
## 11 Unpaid, family business 1 4704. 4704. NA 4704. 4704. 4704. 4704.
## 12 don't know 1 1400 1400 NA 1400 1400 1400 1400
## 13 refusal 1 3951. 3951. NA 3951. 3951. 3951. 3951.
# Filter data
North_East_data <- data %>%
filter(jkl_gor_dv == "North East")
# Divide the categories into two subsets
categories <- unique(North_East_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
North_East_data_1 <- North_East_data %>%
filter(jkl_jbstat %in% categories_1)
North_East_2 <- North_East_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(North_East_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in North East (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(North_East_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in North East (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <- North_East_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 10 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 5 4105. 3627. 3012. 2.5 e+0 7516. 2831. 6551.
## 2 Family care or home 19 3863. 3204. 3695. 5.42e+2 15189. 1848. 3937.
## 3 Full-time student 3 1831. 2176. 597. 1.14e+3 2176. 1659. 2176.
## 4 LT sick or disabled 32 1843. 1807. 764. 6.5 e+1 3204. 1501. 2324.
## 5 On furlough 4 2315. 2157. 950. 1.48e+3 3464 1563. 2908.
## 6 Paid employment(ft/pt) 273 3491. 3029. 2121. 2.71e+2 15189. 2140 4222.
## 7 Retired 331 2480. 2267. 1272. 1.67e+1 7942. 1565. 3113.
## 8 Self employed 40 3446. 3078. 1513. 0 8022. 2717. 4474.
## 9 Temporarily laid off/sho… 1 1948. 1948. NA 1.95e+3 1948. 1948. 1948.
## 10 Unemployed 20 1737. 1485. 1319. 1.70e-1 5316. 775. 2234.
# Filter data
West_Midlands_data <- data %>%
filter(jkl_gor_dv == "West Midlands")
# Divide the categories into two subsets
categories <- unique(West_Midlands_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
West_Midlands_data_1 <- West_Midlands_data %>%
filter(jkl_jbstat %in% categories_1)
West_Midlands_2 <- West_Midlands_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(West_Midlands_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in West Midlands (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(West_Midlands_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in West Midlands (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <- West_Midlands_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 11 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 10 2997. 2390. 2321. 172. 6367. 1320. 5146.
## 2 Family care or home 34 3719. 3147. 2079. 640 8374. 2014. 4834.
## 3 Full-time student 11 1163. 521. 1592. 0 4601. 186. 1038.
## 4 LT sick or disabled 54 2578. 2007. 2129. 0 13241. 1422 3160.
## 5 On furlough 14 4500. 3525. 4072. 622. 16958. 2297. 5207.
## 6 Paid employment(ft/pt) 605 3943. 3544. 2298. 0 13241. 2200 5023.
## 7 Retired 660 2961. 2474. 1982. 309. 12932. 1625. 3708.
## 8 Self employed 80 3821. 3290. 3015. 0 13538. 1944. 4733.
## 9 Unemployed 52 1983. 1470. 1950. 0 9457. 763. 2484.
## 10 Unpaid, family business 3 2578. 2106. 954. 1952. 3675. 2029. 2891.
## 11 refusal 1 3561. 3561. NA 3561. 3561. 3561. 3561.
# Filter data
North_West_data <- data %>%
filter(jkl_gor_dv == "North West")
# Divide the categories into two subsets
categories <- unique(North_West_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
North_West_data_1 <- North_West_data %>%
filter(jkl_jbstat %in% categories_1)
North_West_2 <- North_West_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(North_West_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in North West (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(North_West_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in North West (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <- North_West_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 14 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 12 2694. 1886. 2233. 677. 7572. 1089. 3579.
## 2 Family care or home 42 2614. 2468. 1351. 228. 7027. 1772. 3102.
## 3 Full-time student 11 1896. 1833. 1135. 0 4442. 1502. 2311.
## 4 LT sick or disabled 75 2145. 1878. 1226. 0 6016. 1331. 2710.
## 5 On apprenticeship 2 851. 851. 284. 650 1052. 751. 952.
## 6 On furlough 11 2617. 1619. 2959. 0 10471. 1131. 3301.
## 7 Paid employment(ft/pt) 722 3839. 3353. 2459. 0 23588. 2298. 4793.
## 8 Retired 766 2847. 2414. 1713. 0 13795. 1670. 3570.
## 9 Self employed 118 3627. 3017. 2796. 0 14735. 1795. 4658.
## 10 Temporarily laid off/short… 1 766. 766. NA 766. 766. 766. 766.
## 11 Unemployed 50 1531. 953. 1473. 0 6654. 451. 2215.
## 12 Unpaid, family business 1 1469. 1469. NA 1469. 1469. 1469. 1469.
## 13 don't know 1 520 520 NA 520 520 520 520
## 14 refusal 1 791. 791. NA 791. 791. 791. 791.
# Filter data
Yorkshire_and_The_Humber_data <- data %>%
filter(jkl_gor_dv == "Yorkshire and The Humber")
# Divide the categories into two subsets
categories <- unique(Yorkshire_and_The_Humber_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
Yorkshire_and_The_Humber_data_1 <- Yorkshire_and_The_Humber_data %>%
filter(jkl_jbstat %in% categories_1)
Yorkshire_and_The_Humber_2 <- Yorkshire_and_The_Humber_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(Yorkshire_and_The_Humber_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Yorkshire and The Humber (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(Yorkshire_and_The_Humber_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Yorkshire and The Humber (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
print(plot2)
# Calculate summary statistics
summary_statistics <- Yorkshire_and_The_Humber_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 14 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 9 2654. 2400. 1971. 0 5.70e3 1000 3997
## 2 Family care or home 46 3321. 2887. 1865. 8.52e+2 8.03e3 1768. 4632.
## 3 Full-time student 16 2467. 2451. 1873. 1.08e+2 5.52e3 643. 4062.
## 4 LT sick or disabled 68 2002. 1765 1166. 0 6.33e3 1229. 2661.
## 5 On apprenticeship 1 3002. 3002. NA 3.00e+3 3.00e3 3002. 3002.
## 6 On furlough 18 3179. 3316. 1571. 1.17e+3 5.56e3 1447. 4560.
## 7 On maternity leave 2 4816. 4816. 1703. 3.61e+3 6.02e3 4214. 5418.
## 8 Paid employment(ft/pt) 614 3688. 3354. 4412. 0 1.04e5 2349. 4591.
## 9 Retired 669 2687. 2264. 1841. 0 1.78e4 1507. 3419
## 10 Self employed 95 4258. 2558. 10648. 8.00e-2 1.04e5 1481. 4297
## 11 Temporarily laid off/sh… 1 7979. 7979. NA 7.98e+3 7.98e3 7979. 7979.
## 12 Unemployed 52 2248. 1961. 1629. 0 5.97e3 1083. 3047.
## 13 don't know 2 1138. 1138. 0 1.14e+3 1.14e3 1138. 1138.
## 14 refusal 2 3178. 3178. 1301. 2.26e+3 4.10e3 2718. 3638.
# Filter data
Scotland_data <- data %>%
filter(jkl_gor_dv == "Scotland")
# Divide the categories into two subsets
categories <- unique(Scotland_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
Scotland_data_1 <- Scotland_data %>%
filter(jkl_jbstat %in% categories_1)
Scotland_2 <- Scotland_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(Scotland_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Scotland (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(Scotland_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Scotland (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <-Scotland_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 14 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 6 1740. 1635. 1430. 315. 4179. 673. 2150.
## 2 Family care or home 23 3156. 2951. 2070. 141. 8957 1799. 4138.
## 3 Full-time student 21 2886. 1680 4213. 155. 19036. 731. 2611.
## 4 Govt training scheme 1 1635. 1635. NA 1635. 1635. 1635. 1635.
## 5 LT sick or disabled 80 2400. 1995. 2639. 0 23136. 1218. 2858.
## 6 On furlough 16 2957. 3155. 1718. 200 7386. 1512. 3796.
## 7 On maternity leave 5 3688. 3104. 1659. 2102 6192. 2563. 4476.
## 8 Paid employment(ft/pt) 691 3709. 3312. 2409. 0 30812. 2204. 4727.
## 9 Retired 779 2740. 2271. 1951. 0 14223. 1501. 3318.
## 10 Self employed 109 3318. 2636. 2313. 0 10658. 1667. 4629.
## 11 Temporarily laid off/… 3 12389. 4464. 16007. 1891. 30812. 3178. 17638.
## 12 Unemployed 34 1660. 1239. 1181. 0 4844. 888. 2377.
## 13 Unpaid, family busine… 1 4710. 4710. NA 4710. 4710. 4710. 4710.
## 14 refusal 1 11364. 11364. NA 11364. 11364. 11364. 11364.
# Filter data
East_Midlands_data <- data %>%
filter(jkl_gor_dv == "East Midlands")
# Divide the categories into two subsets
categories <- unique(East_Midlands_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
East_Midlands_data_1 <- East_Midlands_data %>%
filter(jkl_jbstat %in% categories_1)
East_Midlands_2 <- East_Midlands_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(East_Midlands_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in East Midlands (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(East_Midlands_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in East Midlands (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 15000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <-East_Midlands_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 12 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 8 1879. 1080. 1696. 170 4560. 742. 3238.
## 2 Family care or home 26 3116. 2434. 2466. 16.1 11547. 1564. 4226.
## 3 Full-time student 6 2046. 1617. 939. 1200 3682. 1533. 2397.
## 4 LT sick or disabled 49 2501. 1861. 2320. 260 16112. 1330 3017.
## 5 On furlough 5 2832. 2798. 1066. 1776. 4517. 2082. 2989.
## 6 Paid employment(ft/pt) 598 3902. 3413. 2916. 0 38629. 2196. 4849.
## 7 Retired 605 2754. 2397. 1920. 0 20872. 1596. 3428.
## 8 Self employed 99 4056. 3133 4409. 80.0 38629. 1685. 5266.
## 9 Temporarily laid off/shor… 1 5651. 5651. NA 5651. 5651. 5651. 5651.
## 10 Unemployed 53 1679. 1537. 1276. 0 4988. 712. 2223.
## 11 Unpaid, family business 2 5081. 5081. 604. 4654. 5508. 4867. 5294.
## 12 don't know 1 0 0 NA 0 0 0 0
# Filter data
London_data <- data %>%
filter(jkl_gor_dv == "London")
# Divide the categories into two subsets
categories <- unique(London_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
London_data_1 <- London_data %>%
filter(jkl_jbstat %in% categories_1)
London_2 <- London_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(London_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in London (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(London_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in London (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <-London_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 15 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 9 3442. 3491. 2423. 15.2 8294. 2050. 4550.
## 2 Family care or home 64 4416. 3752. 4092. 0 25357. 1830. 5291.
## 3 Full-time student 13 1921. 1005. 2044. 0 6016. 225. 4104.
## 4 Govt training scheme 1 2722 2722 NA 2722 2722 2722 2722
## 5 LT sick or disabled 75 2578. 1707. 2849. 0 21001. 1204. 2999.
## 6 On furlough 25 3503. 3183. 2208. 1185. 10185. 2009. 4241.
## 7 On maternity leave 1 1299 1299 NA 1299 1299 1299 1299
## 8 Paid employment(ft/pt) 875 4610. 4086. 2920. 0 24474. 2533 5984.
## 9 Retired 581 3329. 2525 2629. 0 21001. 1633. 4317.
## 10 Self employed 195 4952. 3758. 4360. 0 25357. 1919. 6357.
## 11 Temporarily laid off/sho… 5 1865. 2219. 1243. 80 3183. 1170 2671.
## 12 Unemployed 97 2402. 1813. 2779. 0 19854. 665 3221.
## 13 Unpaid, family business 2 2529. 2529. 1738. 1300 3758. 1915. 3144.
## 14 don't know 3 6620. 9918. 5712. 24.9 9918. 4971. 9918.
## 15 refusal 8 9185. 5838. 9147. 24.9 19854. 1624. 19854.
# Filter data
South_East_data <- data %>%
filter(jkl_gor_dv == "South East")
# Divide the categories into two subsets
categories <- unique(South_East_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
South_East_data_1 <- South_East_data %>%
filter(jkl_jbstat %in% categories_1)
South_East_2 <- South_East_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(South_East_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in South East (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(South_East_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in South East (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 50000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
print(plot2)
# Calculate summary statistics
summary_statistics <-South_East_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 14 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 13 3028. 2732. 2061. 0 8992. 2117. 3150.
## 2 Family care or home 44 3911. 3118. 3111. 0 15111. 1755. 4935.
## 3 Full-time student 7 1985. 2040. 1579. 0 5024. 1152. 2265.
## 4 LT sick or disabled 53 2343. 1970. 1381. 5.42e2 6682. 1415. 3161.
## 5 On apprenticeship 1 3342. 3342. NA 3.34e3 3342. 3342. 3342.
## 6 On furlough 17 3089. 3122. 1126. 7.24e2 5402. 2752. 3606.
## 7 On maternity leave 5 4525. 2025. 6381. 8.76e2 15906. 1694 2126.
## 8 Paid employment(ft/pt) 981 4287. 3831. 2637. -3.31e3 18698. 2508. 5498.
## 9 Retired 1047 3275. 2804. 2375. 3.84e0 19684. 1726. 3980.
## 10 Self employed 203 4239. 3658. 3295. -3.31e3 19684. 2055. 5229.
## 11 Temporarily laid off/sho… 2 3441. 3441. 1489. 2.39e3 4494. 2914. 3967.
## 12 Unemployed 50 2213. 1377. 2236. 0 9767. 743. 3492.
## 13 Unpaid, family business 1 2448. 2448. NA 2.45e3 2448. 2448. 2448.
## 14 refusal 1 1169. 1169. NA 1.17e3 1169. 1169. 1169.
# Filter data
South_West_data <- data %>%
filter(jkl_gor_dv == "South West")
# Divide the categories into two subsets
categories <- unique(South_West_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
South_West_data_1 <- South_West_data %>%
filter(jkl_jbstat %in% categories_1)
South_West_2 <- South_West_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(South_West_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in South West (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(South_West_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in South West (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <-South_West_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 13 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 8 3106. 3353. 1475. 6.8 e2 5653. 2315. 3674.
## 2 Family care or home 25 3473. 2753. 2819. 0 10526. 1296. 5175.
## 3 Full-time student 12 1741. 1091. 1691. 0 5054. 471. 2715.
## 4 LT sick or disabled 47 2550. 2023. 1457. 5.45e2 6549. 1625. 2897.
## 5 On apprenticeship 2 3482. 3482. 908. 2.84e3 4124. 3161. 3803.
## 6 On furlough 11 2914. 3023. 1340. 9.58e2 4982. 1765. 3774.
## 7 Paid employment(ft/pt) 653 3927. 3447. 2460. 0 23920. 2300 4980.
## 8 Retired 748 2838. 2474. 1898. 2.92e0 23125. 1610. 3476.
## 9 Self employed 155 4128. 2831. 4167. 4.17e1 23125. 1728. 4672.
## 10 Temporarily laid off/shor… 4 2580. 2089. 2633. 8.38e1 6057. 838. 3831.
## 11 Unemployed 35 1793. 1908. 829. 3.23e2 3597. 1130. 2329.
## 12 Unpaid, family business 1 8451. 8451. NA 8.45e3 8451. 8451. 8451.
## 13 refusal 1 9819. 9819. NA 9.82e3 9819. 9819. 9819.
# Filter data
East_of_England_data <- data %>%
filter(jkl_gor_dv == "East of England")
# Divide the categories into two subsets
categories <- unique(East_of_England_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
East_of_England_data_1 <- East_of_England_data %>%
filter(jkl_jbstat %in% categories_1)
East_of_England_2 <- East_of_England_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(East_of_England_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in East of England (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(East_of_England_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in East of England (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 30000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <-East_of_England_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 10 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 3 1012. 1274. 909. 0 1761. 637. 1517.
## 2 Family care or home 32 3917. 2495. 3326. 60 14096. 1804. 5020.
## 3 Full-time student 10 2217. 1961. 1606. 249. 5087. 1127. 2689.
## 4 LT sick or disabled 44 2018. 1871. 1186. 414. 5059. 1094. 2506.
## 5 On furlough 8 3390. 3120. 2388. 573. 8238. 1726. 4203.
## 6 Paid employment(ft/pt) 638 4094. 3899. 2069. 0 12930. 2495. 5278.
## 7 Retired 744 2988. 2450. 2046. 0 15230. 1598. 3711.
## 8 Self employed 133 3665. 3453. 2434. 1.67 14096. 1904. 4772.
## 9 Unemployed 28 2181. 1920. 1885. 5.57 7156. 792. 3087.
## 10 refusal 2 3478. 3478. 781. 2925. 4030. 3201. 3754.
# Filter data
Wales_data <- data %>%
filter(jkl_gor_dv == "Wales")
# Divide the categories into two subsets
categories <- unique(Wales_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
Wales_data_1 <- Wales_data %>%
filter(jkl_jbstat %in% categories_1)
Wales_2 <- Wales_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(Wales_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Wales (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(Wales_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Wales (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
print(plot2)
# Calculate summary statistics
summary_statistics <-Wales_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 13 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 7 2918. 2712. 1770. 1007. 5780. 1619. 3843.
## 2 Family care or home 20 2817. 3062. 1393. 52.5 5923. 1873. 3531.
## 3 Full-time student 6 2454. 2325. 1543. 933. 4502. 1124. 3518.
## 4 LT sick or disabled 50 2777. 2332. 1415. 784 7087. 1821. 3584.
## 5 On furlough 10 2941. 2725. 1647. 558. 6757. 2605. 3420.
## 6 Paid employment(ft/pt) 472 3598. 3343. 1906. 498 12391. 2245. 4434.
## 7 Retired 605 2600. 2279. 1493. 0 8778. 1535. 3403.
## 8 Self employed 80 3483. 3251. 2141. 52.5 13255. 2119. 4286.
## 9 Temporarily laid off/shor… 1 3562. 3562. NA 3562. 3562. 3562. 3562.
## 10 Unemployed 29 1450. 1540. 988. 54.2 4094. 498 2028.
## 11 Unpaid, family business 1 4704. 4704. NA 4704. 4704. 4704. 4704.
## 12 don't know 1 1400 1400 NA 1400 1400 1400 1400
## 13 refusal 1 3951. 3951. NA 3951. 3951. 3951. 3951.
# Filter data
Northern_Ireland_data <- data %>%
filter(jkl_gor_dv == "Northern Ireland")
# Divide the categories into two subsets
categories <- unique(Northern_Ireland_data$jkl_jbstat)
half <- ceiling(length(categories) / 2)
categories_1 <- categories[1:half]
categories_2 <- categories[(half + 1):length(categories)]
# Subset data
Northern_Ireland_data_1 <- Northern_Ireland_data %>%
filter(jkl_jbstat %in% categories_1)
Northern_Ireland_2 <- Northern_Ireland_data %>%
filter(jkl_jbstat %in% categories_2)
# Generate the first box plot
plot1 <- ggplot(Northern_Ireland_data_1, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Northern Ireland (Part 1)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 40000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Generate the second box plot
plot2 <- ggplot(Northern_Ireland_2, aes(x = jkl_jbstat, y = jkl_fihhmnnet1_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = 'Income Distribution by Employment Status in Northern Ireland (Part 2)',
x = 'Employment Status',
y = 'Total Estimated Net Monthly Income (jkl_fihhmnnet1_dv)',
fill = 'Employment Status'
) +
scale_y_continuous(
limits = c(0, 20000),
breaks = seq(0, 120000, by = 15000),
labels = scales::comma,
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
legend.position = "none" # Hide legend as fill colors already labeled on x-axis
)
# Print the plots
print(plot1)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
print(plot2)
# Calculate summary statistics
summary_statistics <-Northern_Ireland_data %>%
group_by(jkl_jbstat) %>%
summarise(
count = n(),
mean = mean(jkl_fihhmnnet1_dv, na.rm = TRUE),
median = median(jkl_fihhmnnet1_dv, na.rm = TRUE),
sd = sd(jkl_fihhmnnet1_dv, na.rm = TRUE),
min = min(jkl_fihhmnnet1_dv, na.rm = TRUE),
max = max(jkl_fihhmnnet1_dv, na.rm = TRUE),
Q1 = quantile(jkl_fihhmnnet1_dv, 0.25, na.rm = TRUE),
Q3 = quantile(jkl_fihhmnnet1_dv, 0.75, na.rm = TRUE)
)
# Print the summary statistics
print(summary_statistics)
## # A tibble: 11 × 9
## jkl_jbstat count mean median sd min max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Doing something else 11 2722. 1900. 2145. 240 6968. 1077. 4167.
## 2 Family care or home 24 3026. 2564. 1704. 1136. 7957. 1798. 3821.
## 3 Full-time student 1 2173. 2173. NA 2173. 2173. 2173. 2173.
## 4 LT sick or disabled 88 2199. 1968. 1387. 0 9967. 1285. 2837.
## 5 On furlough 7 2995. 2757. 1205. 1427. 4626. 2184. 3896.
## 6 On maternity leave 1 1683. 1683. NA 1683. 1683. 1683. 1683.
## 7 Paid employment(ft/pt) 403 3789. 3362. 2217. 0 21825. 2174. 4957
## 8 Retired 504 2781. 2275. 2196. 0 23362. 1504. 3363.
## 9 Self employed 70 3585. 2799. 3269. -794. 21825. 1543. 4686.
## 10 Unemployed 36 1590. 1090. 1306. 0 4472. 636. 2301.
## 11 Unpaid, family business 1 2481. 2481. NA 2481. 2481. 2481. 2481.
Examining the relationship between Urban or Rural Area (jkl_urban_dv) with Household Composition (jkl_hhtype_dv) and Income is also crucial. Urban areas typically have higher living costs but more job opportunities, while rural areas might have lower costs but fewer employment options. Understanding this relationship helps in designing policies that address the unique economic needs of urban and rural households, such as urban housing affordability and rural job creation programs.
# Filter out missing category
filtered_data <- data %>% filter(jkl_urban_dv != "missing")
# Define the groups for the household composition
group1 <- c("1 adult under pensionable age, no children", "1 adult, 1 child", "1 adult, 2 or more children")
group2 <- c("1 female, age 60+, no children", "1 male, aged 65+, no children", "2 adults, not a couple, 1 or more children")
group3 <- c("2 adults, not a couple, both under pensionable age, no children", "2 adults, not a couple, one or more over pensionable age, no children", "3 or more adults, 1 or more children, excl. any couples")
group4 <- c("3 or more adults, 1-2 children, incl. at least one couple", "3 or more adults, no children, excl. any couples", "3 or more adults, no children, incl. at least one couple", "Couple 1 or more over pensionable age,no children", "Couple both under pensionable age, no children", "Couple with 1 child", "Couple with 2 children")
# Create plots for each group
plot_group <- function(data, group, title) {
ggplot(data %>% filter(jkl_hhtype_dv %in% group),
aes(x = jkl_urban_dv, y = jkl_fimnnet_dv, fill = jkl_hhtype_dv)) +
geom_boxplot() +
labs(
title = title,
x = 'Urban or Rural Area (jkl_urban_dv)',
y = 'Total Estimated Net Monthly Income (jkl_fimnnet_dv)',
fill = 'Household Composition'
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Plotting each group
plot1 <- plot_group(filtered_data, group1, 'Group 1: Single Adults and Single Parents')
plot2 <- plot_group(filtered_data, group2, 'Group 2: Older Adults and Non-couple Adults with Children')
plot3 <- plot_group(filtered_data, group3, 'Group 3: Non-couple Adults and Large Households with Children')
plot4 <- plot_group(filtered_data, group4, 'Group 4: Couples and Large Households without Children')
# Print the plots
print(plot1)
print(plot2)
print(plot3)
print(plot4)
The relationship between Ethnic Group (jkl_ethn_dv) and Employment Status and Income helps provide an understanding of how employment status and income vary by ethnic group. This can highlight economic inequalities and potential discrimination in the labor market. This relationship is crucial for developing policies aimed at promoting equal opportunities and reducing economic disparities among different ethnic groups.
# Filter out missing category for ethnic group if needed
filtered_data <- data %>% filter(jkl_ethn_dv != "missing")
# Define the groups for the ethnic groups and employment statuses
ethnic_groups <- list(
group1 = c("african", "arab", "bangladeshi", "caribbean", "chinese"),
group2 = c("any other asian background", "any other black background", "any other ethnic group", "any other mixed background", "any other white background"),
group3 = c("british/english/scottish/welsh/northern irish", "gypsy or irish traveller", "indian", "irish", "pakistani"),
group4 = c("white and asian", "white and black african", "white and black caribbean")
)
employment_statuses <- list(
group1 = c("Paid employment(ft/pt)", "Self employed"),
group2 = c("Unemployed", "Retired"),
group3 = c("Student", "Homemaker", "On maternity leave", "On furlough", "On apprenticeship", "LT sick or disabled", "Govt training scheme"),
group4 = c("Family care or home", "Temporarily laid off/short term working", "On other scheme", "Other inactive")
)
# Function to create plots for each combination of ethnic groups and employment statuses
plot_combination <- function(data, ethnic_group, employment_group, title) {
ggplot(data %>% filter(jkl_ethn_dv %in% ethnic_group & jkl_jbstat %in% employment_group),
aes(x = jkl_ethn_dv, y = jkl_fimnnet_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = title,
x = 'Ethnic Group (jkl_ethn_dv)',
y = 'Total Estimated Net Monthly Income (jkl_fimnnet_dv)',
fill = 'Employment Status'
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Create a nested loop to generate plots for all combinations
plot_list <- list()
plot_number <- 1
for (i in seq_along(ethnic_groups)) {
for (j in seq_along(employment_statuses)) {
title <- paste('Group', plot_number, ': Ethnic Group', i, 'and Employment Status', j)
plot_list[[plot_number]] <- plot_combination(filtered_data, ethnic_groups[[i]], employment_statuses[[j]], title)
plot_number <- plot_number + 1
}
}
# Print the plots
for (plot in plot_list) {
print(plot)
}
The relationships between Socio-economic Classification (jkl_jbsoc00_cc, jkl_jbnssec8_dv, jkl_jbnssec5_dv, jkl_jbnssec3_dv) and Income and Employment Status help to understand how social class and occupational status affect income levels and employment opportunities. Higher socio-economic classes generally have better access to high-paying jobs and economic stability. This information is valuable for assessing social mobility and the effectiveness of policies aimed at reducing economic inequality.
# Filter out missing category for NS-SEC if needed
filtered_data <- data %>% filter(jkl_jbnssec3_dv != "missing")
# Define the groups for the employment statuses
group1 <- c("Paid employment(ft/pt)", "Self employed")
group2 <- c("Unemployed", "Retired")
group3 <- c("Student", "Homemaker", "On maternity leave", "On furlough", "On apprenticeship", "LT sick or disabled", "Govt training scheme")
group4 <- c("Family care or home", "Temporarily laid off/short term working", "On other scheme", "Other inactive")
# Create plots for each group
plot_group <- function(data, group, title) {
ggplot(data %>% filter(jkl_jbstat %in% group),
aes(x = jkl_jbnssec3_dv, y = jkl_fimnnet_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = title,
x = 'NS-SEC Three-Class (jkl_jbnssec3_dv)',
y = 'Total Estimated Net Monthly Income (jkl_fimnnet_dv)',
fill = 'Employment Status'
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Plotting each group
plot1 <- plot_group(filtered_data, group1, 'Group 1: Paid Employment and Self Employed')
plot2 <- plot_group(filtered_data, group2, 'Group 2: Unemployed and Retired')
plot3 <- plot_group(filtered_data, group3, 'Group 3: Students, Homemakers, and Others')
plot4 <- plot_group(filtered_data, group4, 'Group 4: Family Care, Temporarily Laid Off, and Other Inactive')
# Print the plots
print(plot1)
print(plot2)
print(plot3)
print(plot4)
The implications of how Health and Well-being (jkl_sf12mcs_dv, jkl_sf12pcs_dv, jkl_health, jkl_scghq1_dv, jkl_scghq2_dv) and Employment Status and Income interact are also eye-opening. Poor health can lead to lower employment rates and income levels due to reduced work capacity and higher healthcare costs. Analyzing these relationships can highlight the economic impact of health issues and the importance of healthcare policies that support economic productivity and well-being.
# Filter out missing category for health status if needed
filtered_data <- data %>% filter(jkl_health != "missing")
# Define the groups for the employment statuses
group1 <- c("Paid employment(ft/pt)", "Self employed")
group2 <- c("Unemployed", "Retired")
group3 <- c("Student", "Homemaker", "On maternity leave", "On furlough", "On apprenticeship", "LT sick or disabled", "Govt training scheme")
group4 <- c("Family care or home", "Temporarily laid off/short term working", "On other scheme", "Other inactive")
# Create box plots for each group
plot_group <- function(data, group, title) {
ggplot(data %>% filter(jkl_jbstat %in% group),
aes(x = jkl_health, y = jkl_fimnnet_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = title,
x = 'Health Status (jkl_health)',
y = 'Total Estimated Net Monthly Income (jkl_fimnnet_dv)',
fill = 'Employment Status'
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Plotting each group
plot1 <- plot_group(filtered_data, group1, 'Group 1: Paid Employment and Self Employed')
plot2 <- plot_group(filtered_data, group2, 'Group 2: Unemployed and Retired')
plot3 <- plot_group(filtered_data, group3, 'Group 3: Students, Homemakers, and Others')
plot4 <- plot_group(filtered_data, group4, 'Group 4: Family Care, Temporarily Laid Off, and Other Inactive')
# Print the plots
print(plot1)
print(plot2)
print(plot3)
print(plot4)
The interaction between Household Composition and Size (jkl_hhsize, jkl_nkids_dv, jkl_hhtype_dv) with Income shows how larger households or those with more children may have different economic needs and financial pressures compared to smaller households. Understanding these relationships can help in designing social support programs, such as child benefits or family tax credits, to ensure economic stability and well-being for different household types.
# Filter out missing category for household type if needed
filtered_data <- data %>% filter(jkl_hhtype_dv != "missing")
# Define the groups for the household types
group1 <- c("1 adult under pensionable age, no children", "1 adult, 1 child", "1 adult, 2 or more children")
group2 <- c("1 female, age 60+, no children", "1 male, aged 65+, no children", "2 adults, not a couple, 1 or more children")
group3 <- c("2 adults, not a couple, both under pensionable age, no children", "2 adults, not a couple, one or more over pensionable age, no children", "3 or more adults, 1 or more children, excl. any couples")
group4 <- c("3 or more adults, 1-2 children, incl. at least one couple", "3 or more adults, no children, excl. any couples", "3 or more adults, no children, incl. at least one couple", "Couple 1 or more over pensionable age, no children", "Couple both under pensionable age, no children", "Couple with 1 child", "Couple with 2 children")
# Create box plots for each group
plot_group <- function(data, group, title) {
ggplot(data %>% filter(jkl_hhtype_dv %in% group),
aes(x = jkl_hhtype_dv, y = jkl_fimnnet_dv, fill = jkl_hhtype_dv)) +
geom_boxplot() +
labs(
title = title,
x = 'Household Composition (jkl_hhtype_dv)',
y = 'Total Estimated Net Monthly Income (jkl_fimnnet_dv)',
fill = 'Household Composition'
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Plotting each group
plot1 <- plot_group(filtered_data, group1, 'Group 1: Single Adults and Single Parents')
plot2 <- plot_group(filtered_data, group2, 'Group 2: Older Adults and Non-couple Adults with Children')
plot3 <- plot_group(filtered_data, group3, 'Group 3: Non-couple Adults and Large Households with Children')
plot4 <- plot_group(filtered_data, group4, 'Group 4: Couples and Large Households without Children')
# Print the plots
print(plot1)
print(plot2)
print(plot3)
print(plot4)
The interaction between housing Tenure (jkl_tenure_dv) with Income and Employment Status is crucial, especially since homeownership is often associated with greater financial stability and wealth accumulation, while renting can be more common among lower-income and less stable employment groups. Analyzing this relationship can inform housing policies aimed at promoting homeownership and addressing rental market challenges.
# Filter out missing category for housing tenure if needed
filtered_data <- data %>% filter(jkl_tenure_dv != "missing")
# Define the groups for the employment statuses
employment_statuses <- list(
group1 = c("Paid employment(ft/pt)", "Self employed"),
group2 = c("Unemployed", "Retired"),
group3 = c("Student", "Homemaker", "On maternity leave", "On furlough", "On apprenticeship", "LT sick or disabled", "Govt training scheme"),
group4 = c("Family care or home", "Temporarily laid off/short term working", "On other scheme", "Other inactive")
)
# Define the groups for the housing tenure
housing_tenures <- list(
group1 = c("Housing assoc rented", "Local authority rent"),
group2 = c("Owned outright", "Owned with mortgage"),
group3 = c("Rented from employer", "Rented private furnished", "Rented private unfurnished"),
group4 = c("Other", "missing")
)
# Create box plots for each combination of employment status and housing tenure
plot_combination <- function(data, tenure_group, status_group, title) {
ggplot(data %>% filter(jkl_tenure_dv %in% tenure_group & jkl_jbstat %in% status_group),
aes(x = jkl_tenure_dv, y = jkl_fimnnet_dv, fill = jkl_jbstat)) +
geom_boxplot() +
labs(
title = title,
x = 'Housing Tenure (jkl_tenure_dv)',
y = 'Total Estimated Net Monthly Income (jkl_fimnnet_dv)',
fill = 'Employment Status'
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Create and store plots for each combination of groups
plot_list <- list()
plot_number <- 1
for (i in seq_along(housing_tenures)) {
for (j in seq_along(employment_statuses)) {
title <- paste('Plot', plot_number, ': Housing Tenure Group', i, 'and Employment Status Group', j)
plot_list[[plot_number]] <- plot_combination(filtered_data, housing_tenures[[i]], employment_statuses[[j]], title)
plot_number <- plot_number + 1
}
}
# Print the plots
for (plot in plot_list) {
print(plot)
}